
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

                 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, -EINVAL is
 * returned without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
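
/*
 * Note (added for clarity, not in the original source): together these
 * helpers implement a "sticky zero" counter -- once the count has
 * dropped to 0 it can never be raised again.  rbd relies on this for
 * reference counts such as parent_ref; a typical caller does:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0)
 *		... parent is pinned until the ref is dropped ...
 *	else
 *		... parent already went away; must re-probe instead ...
 */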

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
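/*
 * Note (added; the sysfs naming rationale is an assumption based on
 * RBD_SNAP_DEV_NAME_PREFIX): a snapshot name must fit in a
 * NAME_MAX-sized sysfs entry name together with the "snap_" prefix;
 * sizeof () counts the string's terminating NUL, hence the "- 1".
 */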

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *        |     ^                              |
 *        v     \------------------------------/
 *      done
 *        ^
 *        |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
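
/*
 * Illustrative walk-through (added, not in the original source): a
 * write to an object of a layered (cloned) image starts in
 * RBD_OBJ_WRITE_GUARD.  If the OSD reports the object missing
 * (-ENOENT), the covered range must first be populated from the
 * parent: the request reads the relevant parent data and moves to
 * RBD_OBJ_WRITE_COPYUP, which writes the parent data and the new data
 * together.  An image without a parent skips the guard entirely and
 * goes straight through RBD_OBJ_WRITE_FLAT.
 */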

struct rbd_obj_request {
	struct ceph_object_extent ex;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	struct list_head	object_extents;	/* obj_req.ex structs */
	u32			pending_count;

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);		/* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);	/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
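
/*
 * Example (added for clarity, not in the original source): with
 * RBD_SINGLE_MAJOR_PART_SHIFT == 4, each device owns a block of 16
 * minors under the shared major -- one for the whole device plus 15
 * for partitions.  So dev_id 2 maps to minor 32, and any minor in
 * 32..47 maps back to dev_id 2.
 */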

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, 0200, NULL, rbd_add);
static BUS_ATTR(remove, 0200, NULL, rbd_remove);
static BUS_ATTR(add_single_major, 0200, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, 0200, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, 0444, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =	"rbd",
	.release =	rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
		u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_alloc_size, "alloc_size=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true
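
/*
 * Example (added; the exact string format is an assumption based on
 * Documentation/ABI/testing/sysfs-bus-rbd): per-device options are
 * embedded in the string written to the bus "add" file, e.g. something
 * like
 *
 *	echo "1.2.3.4:6789 name=admin,queue_depth=128,notrim rbd img -" \
 *		> /sys/bus/rbd/add
 *
 * Tokens matching rbd_opts_tokens above are consumed by
 * parse_rbd_opts_token() below; unrecognized ones are left to libceph.
 */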

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < 1) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client
 *
 * Takes rbd_client_list_lock itself to unlink the client, so the
 * caller must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static int wait_for_latest_osdmap(struct ceph_client *client)
{
	u64 newest_epoch;
	int ret;

	ret = ceph_monc_get_version(&client->monc, "osdmap", &newest_epoch);
	if (ret)
		return ret;

	if (client->osdc.osdmap->epoch >= newest_epoch)
		return 0;

	ceph_osdc_maybe_request_map(&client->osdc);
	return ceph_monc_wait_osdmap(&client->monc, newest_epoch,
				     client->options->mount_timeout);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = wait_for_latest_osdmap(rbdc->client);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
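
/*
 * Example (added for clarity, not in the original source): for an
 * image without fancy (STRIPINGV2) striping parameters and the default
 * order of 22, this yields object_size = stripe_unit = 4M with
 * stripe_count = 1, i.e. the image is simply chunked into consecutive
 * 4M objects.
 */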

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}
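
/*
 * Example (added for clarity, not in the original source): for format 1
 * the names are packed into one buffer of consecutive NUL-terminated
 * strings, e.g.
 *
 *	snap_names = "monday\0tuesday\0wednesday\0"
 *
 * so which == 1 skips past "monday" and returns a copy of "tuesday".
 */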

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}

/*
 * Search a snapshot context to see if the given snapshot id is
 * present.
 *
 * Returns the position of the snapshot id in the array if it's found,
 * or BAD_SNAP_INDEX otherwise.
 *
 * Note: The snapshot array is kept sorted (by the osd) in
 * reverse order, highest snapshot id first.
 */
static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
{
	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
	u64 *found;

	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
				sizeof (snap_id), snapid_compare_reverse);

	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
}
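
/*
 * Example (added for clarity, not in the original source): with
 * snapc->snaps == { 12, 7, 3 } -- descending, as the osd keeps it --
 * looking up snap_id 7 returns index 1, while looking up snap_id 5
 * returns BAD_SNAP_INDEX.
 */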

static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id)
{
	u32 which;
	const char *snap_name;

	which = rbd_dev_snap_index(rbd_dev, snap_id);
	if (which == BAD_SNAP_INDEX)
		return ERR_PTR(-ENOENT);

	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
}

static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
{
	if (snap_id == CEPH_NOSNAP)
		return RBD_SNAP_HEAD_NAME;

	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (rbd_dev->image_format == 1)
		return rbd_dev_v1_snap_name(rbd_dev, snap_id);

	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
}

static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_size)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_size = rbd_dev->header.image_size;
	} else if (rbd_dev->image_format == 1) {
		u32 which;

		which = rbd_dev_snap_index(rbd_dev, snap_id);
		if (which == BAD_SNAP_INDEX)
			return -ENOENT;

		*snap_size = rbd_dev->header.snap_sizes[which];
	} else {
		u64 size = 0;
		int ret;

		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
		if (ret)
			return ret;

		*snap_size = size;
	}
	return 0;
}

static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
			u64 *snap_features)
{
	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
	if (snap_id == CEPH_NOSNAP) {
		*snap_features = rbd_dev->header.features;
	} else if (rbd_dev->image_format == 1) {
		*snap_features = 0;	/* No features for format 1 */
	} else {
		u64 features = 0;
		int ret;

		ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
		if (ret)
			return ret;

		*snap_features = features;
	}
	return 0;
}

static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
{
	u64 snap_id = rbd_dev->spec->snap_id;
	u64 size = 0;
	u64 features = 0;
	int ret;

	ret = rbd_snap_size(rbd_dev, snap_id, &size);
	if (ret)
		return ret;
	ret = rbd_snap_features(rbd_dev, snap_id, &features);
	if (ret)
		return ret;

	rbd_dev->mapping.size = size;
	rbd_dev->mapping.features = features;

	return 0;
}

static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
{
	rbd_dev->mapping.size = 0;
	rbd_dev->mapping.features = 0;
}

static void zero_bvec(struct bio_vec *bv)
{
	void *buf;
	unsigned long flags;

	buf = bvec_kmap_irq(bv, &flags);
	memset(buf, 0, bv->bv_len);
	flush_dcache_page(bv->bv_page);
	bvec_kunmap_irq(buf, &flags);
}

static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
{
	struct ceph_bio_iter it = *bio_pos;

	ceph_bio_iter_advance(&it, off);
	ceph_bio_iter_advance_step(&it, bytes, ({
		zero_bvec(&bv);
	}));
}

static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
{
	struct ceph_bvec_iter it = *bvec_pos;

	ceph_bvec_iter_advance(&it, off);
	ceph_bvec_iter_advance_step(&it, bytes, ({
1307 zero_bvec(&bv);
1308 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001309}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001310
Alex Elderf7760da2012-10-20 22:17:27 -05001311/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001312 * Zero a range in @obj_req data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001313 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001314 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001315 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001316 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001317static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1318 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001319{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001320 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001321 case OBJ_REQUEST_BIO:
1322 zero_bios(&obj_req->bio_pos, off, bytes);
1323 break;
1324 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001325 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001326 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1327 break;
1328 default:
1329 rbd_assert(0);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001330 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001331}
1332
1333static void rbd_obj_request_destroy(struct kref *kref);
1334static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1335{
1336 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001337 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001338 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001339 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1340}
1341
Alex Elder0f2d5be2014-04-26 14:21:44 +04001342static void rbd_img_request_get(struct rbd_img_request *img_request)
1343{
1344 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001345 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001346 kref_get(&img_request->kref);
1347}
1348
Alex Elderbf0d5f502012-11-22 00:00:08 -06001349static void rbd_img_request_destroy(struct kref *kref);
1350static void rbd_img_request_put(struct rbd_img_request *img_request)
1351{
1352 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001353 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001354 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001355 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001356}
1357
1358static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1359 struct rbd_obj_request *obj_request)
1360{
Alex Elder25dcf952013-01-25 17:08:55 -06001361 rbd_assert(obj_request->img_request == NULL);
1362
Alex Elderb155e862013-04-15 14:50:37 -05001363 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001364 obj_request->img_request = img_request;
Ilya Dryomov7114eda2018-02-01 11:50:47 +01001365 img_request->pending_count++;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001366 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001367}
1368
1369static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1370 struct rbd_obj_request *obj_request)
1371{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001372 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001373 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001374 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001375 rbd_obj_request_put(obj_request);
1376}
1377
Ilya Dryomov980917f2016-09-12 18:59:42 +02001378static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001379{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001380 struct ceph_osd_request *osd_req = obj_request->osd_req;
1381
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001382 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001383 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1384 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001385 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001386}
1387
Alex Elder0c425242013-02-08 09:55:49 -06001388/*
1389 * The default/initial value for all image request flags is 0. Each
1390 * is conditionally set to 1 at image request initialization time
1391 * and currently never changes thereafter.
1392 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001393static void img_request_layered_set(struct rbd_img_request *img_request)
1394{
1395 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1396 smp_mb();
1397}
1398
Alex Eldera2acd002013-05-08 22:50:04 -05001399static void img_request_layered_clear(struct rbd_img_request *img_request)
1400{
1401 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1402 smp_mb();
1403}
1404
Alex Elderd0b2e942013-01-24 16:13:36 -06001405static bool img_request_layered_test(struct rbd_img_request *img_request)
1406{
1407 smp_mb();
1408 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1409}
1410
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001411static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001412{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001413 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1414
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001415 return !obj_req->ex.oe_off &&
1416 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001417}
1418
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001419static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001420{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001421 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001422
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001423 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001424 rbd_dev->layout.object_size;
1425}
1426
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001427static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1428{
1429 return ceph_file_extents_bytes(obj_req->img_extents,
1430 obj_req->num_img_extents);
1431}
1432
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001433static bool rbd_img_is_write(struct rbd_img_request *img_req)
1434{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001435 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001436 case OBJ_OP_READ:
1437 return false;
1438 case OBJ_OP_WRITE:
1439 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001440 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001441 return true;
1442 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001443 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001444 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001445}
1446
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001447static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
Ilya Dryomov27617132015-07-16 17:36:11 +03001448
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001449static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001450{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001451 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001452
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001453 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1454 osd_req->r_result, obj_req);
1455 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001456
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001457 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1458 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1459 obj_req->xferred = osd_req->r_result;
1460 else
1461 /*
1462 * Writes aren't allowed to return a data payload. In some
1463 * guarded write cases (e.g. stat + zero on an empty object)
1464 * a stat response makes it through, but we don't care.
1465 */
1466 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001467
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001468 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001469}
1470
Alex Elder9d4df012013-04-19 15:34:50 -05001471static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001472{
Alex Elder8c042b02013-04-03 01:28:58 -05001473 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001474
Ilya Dryomova162b302018-01-30 17:52:10 +01001475 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001476 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001477}
1478
1479static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1480{
Alex Elder9d4df012013-04-19 15:34:50 -05001481 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001482
Ilya Dryomova162b302018-01-30 17:52:10 +01001483 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001484 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001485 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001486}
1487
Ilya Dryomovbc812072017-01-25 18:16:23 +01001488static struct ceph_osd_request *
Ilya Dryomove28eded2019-02-25 11:42:26 +01001489__rbd_osd_req_create(struct rbd_obj_request *obj_req,
1490 struct ceph_snap_context *snapc, unsigned int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001491{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001492 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001493 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1494 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001495 const char *name_format = rbd_dev->image_format == 1 ?
1496 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001497
Ilya Dryomove28eded2019-02-25 11:42:26 +01001498 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001499 if (!req)
1500 return NULL;
1501
Ilya Dryomovbc812072017-01-25 18:16:23 +01001502 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001503 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001504
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001505 /*
1506 * Data objects may be stored in a separate pool, but always in
1507 * the same namespace in that pool as the header is in its pool.
1508 */
1509 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001510 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001511
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001512 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001513 rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001514 goto err_req;
1515
Ilya Dryomovbc812072017-01-25 18:16:23 +01001516 return req;
1517
1518err_req:
1519 ceph_osdc_put_request(req);
1520 return NULL;
1521}
1522
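/*
 * Illustrative sketch (hypothetical image id and object number): with
 * the format strings above, a format 2 data object is named like
 * "rbd_data.5e64386bb349.000000000000002a" -- object_prefix plus a
 * 16-digit hex object number -- while format 1 appends a 12-digit hex
 * object number to the image's block_name_prefix.
 */
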
Ilya Dryomove28eded2019-02-25 11:42:26 +01001523static struct ceph_osd_request *
1524rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
1525{
1526 return __rbd_osd_req_create(obj_req, obj_req->img_request->snapc,
1527 num_ops);
1528}
1529
Alex Elderbf0d5f502012-11-22 00:00:08 -06001530static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1531{
1532 ceph_osdc_put_request(osd_req);
1533}
1534
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001535static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001536{
1537 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001538
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001539 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001540 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001541 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001542
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001543 ceph_object_extent_init(&obj_request->ex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001544 kref_init(&obj_request->kref);
1545
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001546 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001547 return obj_request;
1548}
1549
1550static void rbd_obj_request_destroy(struct kref *kref)
1551{
1552 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001553 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001554
1555 obj_request = container_of(kref, struct rbd_obj_request, kref);
1556
Alex Elder37206ee2013-02-20 17:32:08 -06001557 dout("%s: obj %p\n", __func__, obj_request);
1558
Alex Elderbf0d5f502012-11-22 00:00:08 -06001559 if (obj_request->osd_req)
1560 rbd_osd_req_destroy(obj_request->osd_req);
1561
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001562 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001563 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001564 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001565 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001566 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001567 case OBJ_REQUEST_OWN_BVECS:
1568 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001569 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001570 default:
1571 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001572 }
1573
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001574 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001575 if (obj_request->copyup_bvecs) {
1576 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1577 if (obj_request->copyup_bvecs[i].bv_page)
1578 __free_page(obj_request->copyup_bvecs[i].bv_page);
1579 }
1580 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001581 }
1582
Alex Elder868311b2013-05-01 12:43:03 -05001583 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001584}
1585
Alex Elderfb65d2282013-05-08 22:50:04 -05001586/* It's OK to call this for a device with no parent */
1587
1588static void rbd_spec_put(struct rbd_spec *spec);
1589static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1590{
1591 rbd_dev_remove_parent(rbd_dev);
1592 rbd_spec_put(rbd_dev->parent_spec);
1593 rbd_dev->parent_spec = NULL;
1594 rbd_dev->parent_overlap = 0;
1595}
1596
Alex Elderbf0d5f502012-11-22 00:00:08 -06001597/*
Alex Eldera2acd002013-05-08 22:50:04 -05001598 * Parent image reference counting is used to determine when an
1599 * image's parent fields can be safely torn down--after there are no
1600 * more in-flight requests to the parent image. When the last
1601 * reference is dropped, cleaning them up is safe.
1602 */
1603static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1604{
1605 int counter;
1606
1607 if (!rbd_dev->parent_spec)
1608 return;
1609
1610 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1611 if (counter > 0)
1612 return;
1613
1614 /* Last reference; clean up parent data structures */
1615
1616 if (!counter)
1617 rbd_dev_unparent(rbd_dev);
1618 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001619 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001620}
1621
1622/*
1623 * If an image has a non-zero parent overlap, get a reference to its
1624 * parent.
1625 *
1626 * Returns true if the rbd device has a parent with a non-zero
1627 * overlap and a reference for it was successfully taken, or
1628 * false otherwise.
1629 */
1630static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1631{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001632 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001633
1634 if (!rbd_dev->parent_spec)
1635 return false;
1636
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001637 down_read(&rbd_dev->header_rwsem);
1638 if (rbd_dev->parent_overlap)
1639 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1640 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001641
1642 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001643 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001644
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001645 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001646}
1647
Alex Elderbf0d5f502012-11-22 00:00:08 -06001648/*
1649 * Caller is responsible for filling in the list of object requests
1650 * that comprises the image request, and the Linux request pointer
1651 * (if there is one).
1652 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001653static struct rbd_img_request *rbd_img_request_create(
1654 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001655 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001656 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001657{
1658 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001659
Ilya Dryomova0c58952018-01-22 16:03:06 +01001660 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001661 if (!img_request)
1662 return NULL;
1663
Alex Elderbf0d5f502012-11-22 00:00:08 -06001664 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001665 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001666 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001667 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001668 else
1669 img_request->snapc = snapc;
1670
Alex Eldera2acd002013-05-08 22:50:04 -05001671 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001672 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001673
Alex Elderbf0d5f502012-11-22 00:00:08 -06001674 spin_lock_init(&img_request->completion_lock);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001675 INIT_LIST_HEAD(&img_request->object_extents);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001676 kref_init(&img_request->kref);
1677
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001678 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1679 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001680 return img_request;
1681}
1682
1683static void rbd_img_request_destroy(struct kref *kref)
1684{
1685 struct rbd_img_request *img_request;
1686 struct rbd_obj_request *obj_request;
1687 struct rbd_obj_request *next_obj_request;
1688
1689 img_request = container_of(kref, struct rbd_img_request, kref);
1690
Alex Elder37206ee2013-02-20 17:32:08 -06001691 dout("%s: img %p\n", __func__, img_request);
1692
Alex Elderbf0d5f502012-11-22 00:00:08 -06001693 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1694 rbd_img_obj_request_del(img_request, obj_request);
1695
Alex Eldera2acd002013-05-08 22:50:04 -05001696 if (img_request_layered_test(img_request)) {
1697 img_request_layered_clear(img_request);
1698 rbd_dev_parent_put(img_request->rbd_dev);
1699 }
1700
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001701 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001702 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001703
Alex Elder1c2a9df2013-05-01 12:43:03 -05001704 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001705}
1706
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001707static void prune_extents(struct ceph_file_extent *img_extents,
1708 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001709{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001710 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001711
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001712 /* drop extents completely beyond the overlap */
1713 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1714 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001715
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001716 if (cnt) {
1717 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001718
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001719 /* trim final overlapping extent */
1720 if (ex->fe_off + ex->fe_len > overlap)
1721 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001722 }
1723
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001724 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001725}
1726
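/*
 * Illustrative sketch, not part of the driver (hypothetical values):
 * with a 4M parent overlap, an extent covering 3M..5M is trimmed to
 * 3M..4M and an extent starting at 6M is dropped entirely.
 */
static void __maybe_unused prune_extents_example(void)
{
	struct ceph_file_extent ex[] = {
		{ .fe_off = 3 * 1024 * 1024, .fe_len = 2 * 1024 * 1024 },
		{ .fe_off = 6 * 1024 * 1024, .fe_len = 1 * 1024 * 1024 },
	};
	u32 cnt = ARRAY_SIZE(ex);

	prune_extents(ex, &cnt, 4 * 1024 * 1024);
	WARN_ON(cnt != 1 || ex[0].fe_len != 1 * 1024 * 1024);
}
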
Alex Elderf1a47392013-04-19 15:34:50 -05001727/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001728 * Determine the byte range(s) covered by either just the object extent
1729 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001730 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001731static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1732 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001733{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001734 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001735 int ret;
1736
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001737 if (!rbd_dev->parent_overlap)
1738 return 0;
1739
1740 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1741 entire ? 0 : obj_req->ex.oe_off,
1742 entire ? rbd_dev->layout.object_size :
1743 obj_req->ex.oe_len,
1744 &obj_req->img_extents,
1745 &obj_req->num_img_extents);
1746 if (ret)
1747 return ret;
1748
1749 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1750 rbd_dev->parent_overlap);
1751 return 0;
1752}
1753
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001754static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
1755{
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001756 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001757 case OBJ_REQUEST_BIO:
1758 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
1759 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001760 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001761 break;
1762 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001763 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001764 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001765 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001766 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001767 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
1768 &obj_req->bvec_pos);
1769 break;
1770 default:
1771 rbd_assert(0);
1772 }
1773}
1774
1775static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1776{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001777 obj_req->osd_req = __rbd_osd_req_create(obj_req, NULL, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001778 if (!obj_req->osd_req)
Ilya Dryomov710214e2016-09-15 17:53:32 +02001779 return -ENOMEM;
1780
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001781 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001782 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001783 rbd_osd_req_setup_data(obj_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001784
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001785 rbd_osd_req_format_read(obj_req);
1786 return 0;
1787}
1788
1789static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
1790 unsigned int which)
1791{
1792 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001793
Alex Elderc5b5ef62013-02-11 12:33:24 -06001794 /*
1795 * The response data for a STAT call consists of:
1796 * le64 length;
1797 * struct {
1798 * le32 tv_sec;
1799 * le32 tv_nsec;
1800 * } mtime;
1801 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001802 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1803 if (IS_ERR(pages))
1804 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001805
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001806 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
1807 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
1808 8 + sizeof(struct ceph_timespec),
1809 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001810 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001811}
1812
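/*
 * Worked example for the STAT reply above: the single response page
 * receives 16 bytes -- an le64 object length followed by an 8-byte
 * struct ceph_timespec (le32 tv_sec + le32 tv_nsec).
 */
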
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001813static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
1814 unsigned int which)
Alex Elderb454e362013-04-19 15:34:50 -05001815{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001816 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1817 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001818
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001819 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
1820 rbd_dev->layout.object_size,
1821 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001822
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001823 if (rbd_obj_is_entire(obj_req))
1824 opcode = CEPH_OSD_OP_WRITEFULL;
1825 else
1826 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001827
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001828 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001829 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001830 rbd_osd_req_setup_data(obj_req, which++);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001831
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001832 rbd_assert(which == obj_req->osd_req->r_num_ops);
1833 rbd_osd_req_format_write(obj_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001834}
1835
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001836static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001837{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001838 unsigned int num_osd_ops, which = 0;
1839 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001840
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001841 /* reverse map the entire object onto the parent */
1842 ret = rbd_obj_calc_img_extents(obj_req, true);
1843 if (ret)
1844 return ret;
1845
1846 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001847 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1848 num_osd_ops = 3; /* stat + setallochint + write/writefull */
1849 } else {
1850 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1851 num_osd_ops = 2; /* setallochint + write/writefull */
1852 }
1853
Ilya Dryomova162b302018-01-30 17:52:10 +01001854 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001855 if (!obj_req->osd_req)
1856 return -ENOMEM;
1857
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001858 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001859 ret = __rbd_obj_setup_stat(obj_req, which++);
1860 if (ret)
1861 return ret;
1862 }
1863
1864 __rbd_obj_setup_write(obj_req, which);
1865 return 0;
1866}
1867
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001868static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1869{
1870 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1871 CEPH_OSD_OP_ZERO;
1872}
1873
1874static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1875{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001876 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1877 u64 off = obj_req->ex.oe_off;
1878 u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001879 int ret;
1880
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001881 /*
1882 * Align the range to alloc_size boundary and punt on discards
1883 * that are too small to free up any space.
1884 *
1885 * alloc_size == object_size && is_tail() is a special case for
1886 * filestore with filestore_punch_hole = false, needed to allow
1887 * truncate (in addition to delete).
1888 */
1889 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
1890 !rbd_obj_is_tail(obj_req)) {
1891 off = round_up(off, rbd_dev->opts->alloc_size);
1892 next_off = round_down(next_off, rbd_dev->opts->alloc_size);
1893 if (off >= next_off)
1894 return 1;
1895 }
1896
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001897 /* reverse map the entire object onto the parent */
1898 ret = rbd_obj_calc_img_extents(obj_req, true);
1899 if (ret)
1900 return ret;
1901
1902 obj_req->osd_req = rbd_osd_req_create(obj_req, 1);
1903 if (!obj_req->osd_req)
1904 return -ENOMEM;
1905
1906 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
1907 osd_req_op_init(obj_req->osd_req, 0, CEPH_OSD_OP_DELETE, 0);
1908 } else {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001909 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
1910 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
1911 off, next_off - off);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001912 osd_req_op_extent_init(obj_req->osd_req, 0,
1913 truncate_or_zero_opcode(obj_req),
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001914 off, next_off - off, 0, 0);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001915 }
1916
1917 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1918 rbd_osd_req_format_write(obj_req);
1919 return 0;
1920}
1921
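/*
 * Illustrative sketch, not part of the driver (hypothetical 64K
 * alloc_size): a 10K..110K discard rounds to an empty range and is
 * punted, while a 10K..210K discard shrinks to the aligned 64K..192K
 * range.
 */
static void __maybe_unused discard_alignment_example(void)
{
	u64 off = round_up(10 * 1024ULL, 64 * 1024);		/* 64K */
	u64 next_off = round_down(210 * 1024ULL, 64 * 1024);	/* 192K */

	/* only the aligned middle of the larger discard is zeroed */
	WARN_ON(next_off - off != 128 * 1024);
}
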
1922static void __rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001923 unsigned int which)
1924{
1925 u16 opcode;
1926
1927 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001928 if (obj_req->num_img_extents) {
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001929 osd_req_op_init(obj_req->osd_req, which++,
1930 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001931 opcode = CEPH_OSD_OP_TRUNCATE;
1932 } else {
1933 osd_req_op_init(obj_req->osd_req, which++,
1934 CEPH_OSD_OP_DELETE, 0);
1935 opcode = 0;
1936 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001937 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001938 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001939 }
1940
1941 if (opcode)
1942 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001943 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001944 0, 0);
1945
1946 rbd_assert(which == obj_req->osd_req->r_num_ops);
1947 rbd_osd_req_format_write(obj_req);
1948}
1949
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001950static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001951{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001952 unsigned int num_osd_ops, which = 0;
1953 int ret;
1954
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001955 /* reverse map the entire object onto the parent */
1956 ret = rbd_obj_calc_img_extents(obj_req, true);
1957 if (ret)
1958 return ret;
1959
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001960 if (rbd_obj_is_entire(obj_req)) {
1961 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
Ilya Dryomov2bb1e562018-02-06 19:26:34 +01001962 if (obj_req->num_img_extents)
1963 num_osd_ops = 2; /* create + truncate */
1964 else
1965 num_osd_ops = 1; /* delete */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001966 } else {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001967 if (obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001968 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
1969 num_osd_ops = 2; /* stat + truncate/zero */
1970 } else {
1971 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
1972 num_osd_ops = 1; /* truncate/zero */
1973 }
1974 }
1975
Ilya Dryomova162b302018-01-30 17:52:10 +01001976 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001977 if (!obj_req->osd_req)
1978 return -ENOMEM;
1979
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001980 if (!rbd_obj_is_entire(obj_req) && obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001981 ret = __rbd_obj_setup_stat(obj_req, which++);
1982 if (ret)
1983 return ret;
1984 }
1985
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001986 __rbd_obj_setup_zeroout(obj_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001987 return 0;
1988}
1989
1990/*
1991 * For each object request in @img_req, allocate an OSD request, add
1992 * individual OSD ops and prepare them for submission. The number of
1993 * OSD ops depends on op_type and the overlap point (if any).
1994 */
1995static int __rbd_img_fill_request(struct rbd_img_request *img_req)
1996{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001997 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001998 int ret;
1999
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002000 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002001 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002002 case OBJ_OP_READ:
2003 ret = rbd_obj_setup_read(obj_req);
2004 break;
2005 case OBJ_OP_WRITE:
2006 ret = rbd_obj_setup_write(obj_req);
2007 break;
2008 case OBJ_OP_DISCARD:
2009 ret = rbd_obj_setup_discard(obj_req);
2010 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002011 case OBJ_OP_ZEROOUT:
2012 ret = rbd_obj_setup_zeroout(obj_req);
2013 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002014 default:
2015 rbd_assert(0);
2016 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002017 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002018 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002019 if (ret > 0) {
2020 img_req->xferred += obj_req->ex.oe_len;
2021 img_req->pending_count--;
2022 rbd_img_obj_request_del(img_req, obj_req);
2023 continue;
2024 }
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002025
2026 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2027 if (ret)
2028 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002029 }
2030
2031 return 0;
2032}
2033
Ilya Dryomov5a237812018-02-06 19:26:34 +01002034union rbd_img_fill_iter {
2035 struct ceph_bio_iter bio_iter;
2036 struct ceph_bvec_iter bvec_iter;
2037};
2038
2039struct rbd_img_fill_ctx {
2040 enum obj_request_type pos_type;
2041 union rbd_img_fill_iter *pos;
2042 union rbd_img_fill_iter iter;
2043 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002044 ceph_object_extent_fn_t count_fn;
2045 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002046};
2047
2048static struct ceph_object_extent *alloc_object_extent(void *arg)
2049{
2050 struct rbd_img_request *img_req = arg;
2051 struct rbd_obj_request *obj_req;
2052
2053 obj_req = rbd_obj_request_create();
2054 if (!obj_req)
2055 return NULL;
2056
2057 rbd_img_obj_request_add(img_req, obj_req);
2058 return &obj_req->ex;
2059}
2060
2061/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002062 * While su != os && sc == 1 is technically not fancy (it's the same
2063 * layout as su == os && sc == 1), we can't use the nocopy path for it
2064 * because ->set_pos_fn() should be called only once per object.
2065 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2066 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002067 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002068static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2069{
2070 return l->stripe_unit != l->object_size;
2071}
2072
2073static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2074 struct ceph_file_extent *img_extents,
2075 u32 num_img_extents,
2076 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002077{
2078 u32 i;
2079 int ret;
2080
2081 img_req->data_type = fctx->pos_type;
2082
2083 /*
2084 * Create object requests and set each object request's starting
2085 * position in the provided bio (list) or bio_vec array.
2086 */
2087 fctx->iter = *fctx->pos;
2088 for (i = 0; i < num_img_extents; i++) {
2089 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2090 img_extents[i].fe_off,
2091 img_extents[i].fe_len,
2092 &img_req->object_extents,
2093 alloc_object_extent, img_req,
2094 fctx->set_pos_fn, &fctx->iter);
2095 if (ret)
2096 return ret;
2097 }
2098
2099 return __rbd_img_fill_request(img_req);
2100}
2101
Ilya Dryomovafb97882018-02-06 19:26:35 +01002102/*
2103 * Map a list of image extents to a list of object extents, create the
2104 * corresponding object requests (normally each to a different object,
2105 * but not always) and add them to @img_req. For each object request,
2106 * set up its data descriptor to point to the corresponding chunk(s) of
2107 * @fctx->pos data buffer.
2108 *
2109 * Because ceph_file_to_extents() will merge adjacent object extents
2110 * together, each object request's data descriptor may point to multiple
2111 * different chunks of @fctx->pos data buffer.
2112 *
2113 * @fctx->pos data buffer is assumed to be large enough.
2114 */
2115static int rbd_img_fill_request(struct rbd_img_request *img_req,
2116 struct ceph_file_extent *img_extents,
2117 u32 num_img_extents,
2118 struct rbd_img_fill_ctx *fctx)
2119{
2120 struct rbd_device *rbd_dev = img_req->rbd_dev;
2121 struct rbd_obj_request *obj_req;
2122 u32 i;
2123 int ret;
2124
2125 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2126 !rbd_layout_is_fancy(&rbd_dev->layout))
2127 return rbd_img_fill_request_nocopy(img_req, img_extents,
2128 num_img_extents, fctx);
2129
2130 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2131
2132 /*
2133 * Create object requests and determine ->bvec_count for each object
2134 * request. Note that ->bvec_count sum over all object requests may
2135 * be greater than the number of bio_vecs in the provided bio (list)
2136 * or bio_vec array because when mapped, those bio_vecs can straddle
2137 * stripe unit boundaries.
2138 */
2139 fctx->iter = *fctx->pos;
2140 for (i = 0; i < num_img_extents; i++) {
2141 ret = ceph_file_to_extents(&rbd_dev->layout,
2142 img_extents[i].fe_off,
2143 img_extents[i].fe_len,
2144 &img_req->object_extents,
2145 alloc_object_extent, img_req,
2146 fctx->count_fn, &fctx->iter);
2147 if (ret)
2148 return ret;
2149 }
2150
2151 for_each_obj_request(img_req, obj_req) {
2152 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2153 sizeof(*obj_req->bvec_pos.bvecs),
2154 GFP_NOIO);
2155 if (!obj_req->bvec_pos.bvecs)
2156 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002157 }
2158
2159 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002160 * Fill in each object request's private bio_vec array, splitting and
2161 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002162 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002163 fctx->iter = *fctx->pos;
2164 for (i = 0; i < num_img_extents; i++) {
2165 ret = ceph_iterate_extents(&rbd_dev->layout,
2166 img_extents[i].fe_off,
2167 img_extents[i].fe_len,
2168 &img_req->object_extents,
2169 fctx->copy_fn, &fctx->iter);
2170 if (ret)
2171 return ret;
2172 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002173
Ilya Dryomovafb97882018-02-06 19:26:35 +01002174 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002175}
2176
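/*
 * Illustrative sketch (hypothetical layout): with stripe_unit == 1M
 * and object_size == 4M the layout is fancy, so the data buffer is
 * first counted and then copied into per-object bvec arrays; with
 * stripe_unit == object_size the nocopy path above just records each
 * object request's starting position.
 */
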
Ilya Dryomov5a237812018-02-06 19:26:34 +01002177static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2178 u64 off, u64 len)
2179{
2180 struct ceph_file_extent ex = { off, len };
2181 union rbd_img_fill_iter dummy;
2182 struct rbd_img_fill_ctx fctx = {
2183 .pos_type = OBJ_REQUEST_NODATA,
2184 .pos = &dummy,
2185 };
2186
2187 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2188}
2189
2190static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2191{
2192 struct rbd_obj_request *obj_req =
2193 container_of(ex, struct rbd_obj_request, ex);
2194 struct ceph_bio_iter *it = arg;
2195
2196 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2197 obj_req->bio_pos = *it;
2198 ceph_bio_iter_advance(it, bytes);
2199}
2200
Ilya Dryomovafb97882018-02-06 19:26:35 +01002201static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2202{
2203 struct rbd_obj_request *obj_req =
2204 container_of(ex, struct rbd_obj_request, ex);
2205 struct ceph_bio_iter *it = arg;
2206
2207 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2208 ceph_bio_iter_advance_step(it, bytes, ({
2209 obj_req->bvec_count++;
2210 }));
2212}
2213
2214static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2215{
2216 struct rbd_obj_request *obj_req =
2217 container_of(ex, struct rbd_obj_request, ex);
2218 struct ceph_bio_iter *it = arg;
2219
2220 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2221 ceph_bio_iter_advance_step(it, bytes, ({
2222 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2223 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2224 }));
2225}
2226
Ilya Dryomov5a237812018-02-06 19:26:34 +01002227static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2228 struct ceph_file_extent *img_extents,
2229 u32 num_img_extents,
2230 struct ceph_bio_iter *bio_pos)
2231{
2232 struct rbd_img_fill_ctx fctx = {
2233 .pos_type = OBJ_REQUEST_BIO,
2234 .pos = (union rbd_img_fill_iter *)bio_pos,
2235 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002236 .count_fn = count_bio_bvecs,
2237 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002238 };
2239
2240 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2241 &fctx);
2242}
2243
2244static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2245 u64 off, u64 len, struct bio *bio)
2246{
2247 struct ceph_file_extent ex = { off, len };
2248 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2249
2250 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2251}
2252
2253static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2254{
2255 struct rbd_obj_request *obj_req =
2256 container_of(ex, struct rbd_obj_request, ex);
2257 struct ceph_bvec_iter *it = arg;
2258
2259 obj_req->bvec_pos = *it;
2260 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2261 ceph_bvec_iter_advance(it, bytes);
2262}
2263
Ilya Dryomovafb97882018-02-06 19:26:35 +01002264static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2265{
2266 struct rbd_obj_request *obj_req =
2267 container_of(ex, struct rbd_obj_request, ex);
2268 struct ceph_bvec_iter *it = arg;
2269
2270 ceph_bvec_iter_advance_step(it, bytes, ({
2271 obj_req->bvec_count++;
2272 }));
2273}
2274
2275static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2276{
2277 struct rbd_obj_request *obj_req =
2278 container_of(ex, struct rbd_obj_request, ex);
2279 struct ceph_bvec_iter *it = arg;
2280
2281 ceph_bvec_iter_advance_step(it, bytes, ({
2282 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2283 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2284 }));
2285}
2286
Ilya Dryomov5a237812018-02-06 19:26:34 +01002287static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2288 struct ceph_file_extent *img_extents,
2289 u32 num_img_extents,
2290 struct ceph_bvec_iter *bvec_pos)
2291{
2292 struct rbd_img_fill_ctx fctx = {
2293 .pos_type = OBJ_REQUEST_BVECS,
2294 .pos = (union rbd_img_fill_iter *)bvec_pos,
2295 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002296 .count_fn = count_bvecs,
2297 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002298 };
2299
2300 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2301 &fctx);
2302}
2303
2304static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2305 struct ceph_file_extent *img_extents,
2306 u32 num_img_extents,
2307 struct bio_vec *bvecs)
2308{
2309 struct ceph_bvec_iter it = {
2310 .bvecs = bvecs,
2311 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2312 num_img_extents) },
2313 };
2314
2315 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2316 &it);
2317}
2318
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002319static void rbd_img_request_submit(struct rbd_img_request *img_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002320{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002321 struct rbd_obj_request *obj_request;
2322
Alex Elder37206ee2013-02-20 17:32:08 -06002323 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002324
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002325 rbd_img_request_get(img_request);
Ilya Dryomovefbd1a12018-01-30 17:52:11 +01002326 for_each_obj_request(img_request, obj_request)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002327 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002328
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002329 rbd_img_request_put(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002330}
2331
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002332static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002333{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002334 struct rbd_img_request *img_req = obj_req->img_request;
2335 struct rbd_img_request *child_img_req;
2336 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002337
Ilya Dryomove93aca02018-02-06 19:26:35 +01002338 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2339 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002340 if (!child_img_req)
2341 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002342
Ilya Dryomove93aca02018-02-06 19:26:35 +01002343 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2344 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002345
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002346 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002347 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002348 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002349 ret = __rbd_img_fill_from_bio(child_img_req,
2350 obj_req->img_extents,
2351 obj_req->num_img_extents,
2352 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002353 break;
2354 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002355 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002356 ret = __rbd_img_fill_from_bvecs(child_img_req,
2357 obj_req->img_extents,
2358 obj_req->num_img_extents,
2359 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002360 break;
2361 default:
2362 rbd_assert(0);
2363 }
2364 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002365 ret = rbd_img_fill_from_bvecs(child_img_req,
2366 obj_req->img_extents,
2367 obj_req->num_img_extents,
2368 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002369 }
2370 if (ret) {
2371 rbd_img_request_put(child_img_req);
2372 return ret;
2373 }
2374
2375 rbd_img_request_submit(child_img_req);
2376 return 0;
2377}
2378
2379static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2380{
2381 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2382 int ret;
2383
2384 if (obj_req->result == -ENOENT &&
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002385 rbd_dev->parent_overlap && !obj_req->tried_parent) {
2386 /* reverse map this object extent onto the parent */
2387 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002388 if (ret) {
2389 obj_req->result = ret;
2390 return true;
2391 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002392
2393 if (obj_req->num_img_extents) {
2394 obj_req->tried_parent = true;
2395 ret = rbd_obj_read_from_parent(obj_req);
2396 if (ret) {
2397 obj_req->result = ret;
2398 return true;
2399 }
2400 return false;
2401 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002402 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002403
2404 /*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002405 * -ENOENT means a hole in the image -- zero-fill the entire
2406 * length of the request. A short read also implies zero-fill
2407 * to the end of the request. In both cases we update xferred
2408 * count to indicate the whole request was satisfied.
Alex Elder02c74fb2013-05-06 17:40:33 -05002409 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002410 if (obj_req->result == -ENOENT ||
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002411 (!obj_req->result && obj_req->xferred < obj_req->ex.oe_len)) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002412 rbd_assert(!obj_req->xferred || !obj_req->result);
2413 rbd_obj_zero_range(obj_req, obj_req->xferred,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002414 obj_req->ex.oe_len - obj_req->xferred);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002415 obj_req->result = 0;
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002416 obj_req->xferred = obj_req->ex.oe_len;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002417 }
2418
2419 return true;
2420}
2421
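/*
 * Worked example for the zero-fill above (hypothetical sizes): a 4M
 * object read that returns only 1M of data is zero-filled from 1M to
 * 4M and reported as fully transferred; an -ENOENT hole is zero-filled
 * in full.
 */
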
2422/*
2423 * copyup_bvecs pages are never highmem pages
2424 */
2425static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2426{
2427 struct ceph_bvec_iter it = {
2428 .bvecs = bvecs,
2429 .iter = { .bi_size = bytes },
2430 };
2431
2432 ceph_bvec_iter_advance_step(&it, bytes, ({
2433 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2434 bv.bv_len))
2435 return false;
2436 }));
2437 return true;
2438}
2439
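/*
 * Illustrative sketch, not part of the driver: a freshly zeroed page
 * is detected as all-zero copyup data.
 */
static void __maybe_unused is_zero_bvecs_example(void)
{
	struct page *page = alloc_page(GFP_NOIO | __GFP_ZERO);
	struct bio_vec bv;

	if (!page)
		return;

	bv.bv_page = page;
	bv.bv_offset = 0;
	bv.bv_len = PAGE_SIZE;
	WARN_ON(!is_zero_bvecs(&bv, PAGE_SIZE));
	__free_page(page);
}
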
2440static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2441{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002442 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
Chengguang Xufe943d52018-04-12 12:04:55 +08002443 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002444
2445 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
2446 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
2447 rbd_osd_req_destroy(obj_req->osd_req);
2448
2449 /*
2450 * Create a copyup request with the same number of OSD ops as
2451 * the original request. The original request was stat + op(s),
2452 * the new copyup request will be copyup + the same op(s).
2453 */
Ilya Dryomova162b302018-01-30 17:52:10 +01002454 obj_req->osd_req = rbd_osd_req_create(obj_req, num_osd_ops);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002455 if (!obj_req->osd_req)
2456 return -ENOMEM;
2457
Ilya Dryomov24639ce562018-09-26 19:12:07 +02002458 ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
Chengguang Xufe943d52018-04-12 12:04:55 +08002459 if (ret)
2460 return ret;
2461
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002462 /*
2463 * Only send non-zero copyup data to save some I/O and network
2464 * bandwidth -- zero copyup data is equivalent to the object not
2465 * existing.
2466 */
2467 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2468 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2469 bytes = 0;
2470 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002471 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002472 obj_req->copyup_bvecs,
2473 obj_req->copyup_bvec_count,
2474 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002475
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002476 switch (obj_req->img_request->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002477 case OBJ_OP_WRITE:
2478 __rbd_obj_setup_write(obj_req, 1);
2479 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002480 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002481 rbd_assert(!rbd_obj_is_entire(obj_req));
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002482 __rbd_obj_setup_zeroout(obj_req, 1);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002483 break;
2484 default:
2485 rbd_assert(0);
2486 }
2487
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002488 ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
2489 if (ret)
2490 return ret;
2491
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002492 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002493 return 0;
2494}
2495
static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
{
	u32 i;

	rbd_assert(!obj_req->copyup_bvecs);
	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
					sizeof(*obj_req->copyup_bvecs),
					GFP_NOIO);
	if (!obj_req->copyup_bvecs)
		return -ENOMEM;

	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);

		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
		if (!obj_req->copyup_bvecs[i].bv_page)
			return -ENOMEM;

		obj_req->copyup_bvecs[i].bv_offset = 0;
		obj_req->copyup_bvecs[i].bv_len = len;
		obj_overlap -= len;
	}

	rbd_assert(!obj_overlap);
	return 0;
}

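/*
 * Worked example (illustration only, assuming 4 KiB pages): for
 * obj_overlap = 9000, calc_pages_for(0, 9000) yields 3, so three
 * single-page bvecs are allocated with bv_len = 4096, 4096 and 808.
 * The final bvec covers only the remainder, not a whole page.
 */
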
static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
{
	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
	int ret;

	rbd_assert(obj_req->num_img_extents);
	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
		      rbd_dev->parent_overlap);
	if (!obj_req->num_img_extents) {
		/*
		 * The overlap has become 0 (most likely because the
		 * image has been flattened).  Use rbd_obj_issue_copyup()
		 * to re-submit the original write request -- the copyup
		 * operation itself will be a no-op, since someone must
		 * have populated the child object while we weren't
		 * looking.  Move to WRITE_FLAT state as we'll be done
		 * with the operation once the null copyup completes.
		 */
		obj_req->write_state = RBD_OBJ_WRITE_FLAT;
		return rbd_obj_issue_copyup(obj_req, 0);
	}

	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
	if (ret)
		return ret;

	obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
	return rbd_obj_read_from_parent(obj_req);
}

static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
{
	int ret;

again:
	switch (obj_req->write_state) {
	case RBD_OBJ_WRITE_GUARD:
		rbd_assert(!obj_req->xferred);
		if (obj_req->result == -ENOENT) {
			/*
			 * The target object doesn't exist.  Read the data for
			 * the entire target object up to the overlap point (if
			 * any) from the parent, so we can use it for a copyup.
			 */
			ret = rbd_obj_handle_write_guard(obj_req);
			if (ret) {
				obj_req->result = ret;
				return true;
			}
			return false;
		}
		/* fall through */
	case RBD_OBJ_WRITE_FLAT:
		if (!obj_req->result)
			/*
			 * There is no such thing as a successful short
			 * write -- indicate the whole request was satisfied.
			 */
			obj_req->xferred = obj_req->ex.oe_len;
		return true;
	case RBD_OBJ_WRITE_COPYUP:
		obj_req->write_state = RBD_OBJ_WRITE_GUARD;
		if (obj_req->result)
			goto again;

		rbd_assert(obj_req->xferred);
		ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
		if (ret) {
			obj_req->result = ret;
			obj_req->xferred = 0;
			return true;
		}
		return false;
	default:
		BUG();
	}
}

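/*
 * Rough guide to the write states above (illustration only):
 *
 *	RBD_OBJ_WRITE_FLAT    - plain write, no guard; any result
 *				completes the object request
 *	RBD_OBJ_WRITE_GUARD   - stat + op(s); -ENOENT means the object
 *				is missing, so parent data is read in
 *				for a copyup
 *	RBD_OBJ_WRITE_COPYUP  - parent read finished; issue copyup +
 *				op(s) and fall back to GUARD to handle
 *				the result
 *
 * A parent overlap that has shrunk to 0 short-circuits GUARD to FLAT
 * via a null copyup, as described in rbd_obj_handle_write_guard().
 */
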
/*
 * Returns true if @obj_req is completed, or false otherwise.
 */
static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	switch (obj_req->img_request->op_type) {
	case OBJ_OP_READ:
		return rbd_obj_handle_read(obj_req);
	case OBJ_OP_WRITE:
		return rbd_obj_handle_write(obj_req);
	case OBJ_OP_DISCARD:
	case OBJ_OP_ZEROOUT:
		if (rbd_obj_handle_write(obj_req)) {
			/*
			 * Hide -ENOENT from delete/truncate/zero -- discarding
			 * a non-existent object is not a problem.
			 */
			if (obj_req->result == -ENOENT) {
				obj_req->result = 0;
				obj_req->xferred = obj_req->ex.oe_len;
			}
			return true;
		}
		return false;
	default:
		BUG();
	}
}

static void rbd_obj_end_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req = obj_req->img_request;

	rbd_assert((!obj_req->result &&
		    obj_req->xferred == obj_req->ex.oe_len) ||
		   (obj_req->result < 0 && !obj_req->xferred));
	if (!obj_req->result) {
		img_req->xferred += obj_req->xferred;
		return;
	}

	rbd_warn(img_req->rbd_dev,
		 "%s at objno %llu %llu~%llu result %d xferred %llu",
		 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
		 obj_req->ex.oe_off, obj_req->ex.oe_len, obj_req->result,
		 obj_req->xferred);
	if (!img_req->result) {
		img_req->result = obj_req->result;
		img_req->xferred = 0;
	}
}

static void rbd_img_end_child_request(struct rbd_img_request *img_req)
{
	struct rbd_obj_request *obj_req = img_req->obj_request;

	rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == rbd_obj_img_extents_bytes(obj_req)) ||
		   (img_req->result < 0 && !img_req->xferred));

	obj_req->result = img_req->result;
	obj_req->xferred = img_req->xferred;
	rbd_img_request_put(img_req);
}

static void rbd_img_end_request(struct rbd_img_request *img_req)
{
	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
	rbd_assert((!img_req->result &&
		    img_req->xferred == blk_rq_bytes(img_req->rq)) ||
		   (img_req->result < 0 && !img_req->xferred));

	blk_mq_end_request(img_req->rq,
			   errno_to_blk_status(img_req->result));
	rbd_img_request_put(img_req);
}

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
{
	struct rbd_img_request *img_req;

again:
	if (!__rbd_obj_handle_request(obj_req))
		return;

	img_req = obj_req->img_request;
	spin_lock(&img_req->completion_lock);
	rbd_obj_end_request(obj_req);
	rbd_assert(img_req->pending_count);
	if (--img_req->pending_count) {
		spin_unlock(&img_req->completion_lock);
		return;
	}

	spin_unlock(&img_req->completion_lock);
	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
		obj_req = img_req->obj_request;
		rbd_img_end_child_request(img_req);
		goto again;
	}
	rbd_img_end_request(img_req);
}

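/*
 * Completion bubbling in brief (illustration only): each finished
 * object request decrements its image request's pending_count.  When
 * the count hits zero, a child image request (a parent read spawned
 * for copyup) copies its result into the originating object request
 * and the loop re-runs __rbd_obj_handle_request() on it; a top-level
 * image request simply ends the block layer request.
 */
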
static const struct rbd_client_id rbd_empty_cid;

static bool rbd_cid_equal(const struct rbd_client_id *lhs,
			  const struct rbd_client_id *rhs)
{
	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
}

static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
{
	struct rbd_client_id cid;

	mutex_lock(&rbd_dev->watch_mutex);
	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
	cid.handle = rbd_dev->watch_cookie;
	mutex_unlock(&rbd_dev->watch_mutex);
	return cid;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
			      const struct rbd_client_id *cid)
{
	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
	     cid->gid, cid->handle);
	rbd_dev->owner_cid = *cid; /* struct */
}

static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
{
	mutex_lock(&rbd_dev->watch_mutex);
	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
	mutex_unlock(&rbd_dev->watch_mutex);
}

static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
{
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);

	strcpy(rbd_dev->lock_cookie, cookie);
	rbd_set_owner_cid(rbd_dev, &cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
}

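/*
 * For illustration: with RBD_LOCK_COOKIE_PREFIX ("auto", defined
 * earlier in this file) and a watch cookie of 94769219021312,
 * format_lock_cookie() produces
 *
 *	"auto 94769219021312"
 *
 * which find_watcher() below parses back with sscanf() to match a
 * locker against the watchers on the header object.
 */
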
/*
 * lock_rwsem must be held for write
 */
static int rbd_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] != '\0');

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
			    RBD_LOCK_TAG, "", 0);
	if (ret)
		return ret;

	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
	__rbd_lock(rbd_dev, cookie);
	return 0;
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_unlock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
		rbd_dev->lock_cookie[0] == '\0');

	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
	if (ret && ret != -ENOENT)
		rbd_warn(rbd_dev, "failed to unlock: %d", ret);

	/* treat errors as the image is unlocked */
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	rbd_dev->lock_cookie[0] = '\0';
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
}

static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
				enum rbd_notify_op notify_op,
				struct page ***preply_pages,
				size_t *preply_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	void *p = buf;

	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);

	/* encode *LockPayload NotifyMessage (op + ClientId) */
	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
	ceph_encode_32(&p, notify_op);
	ceph_encode_64(&p, cid.gid);
	ceph_encode_64(&p, cid.handle);

	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
				&rbd_dev->header_oloc, buf, buf_size,
				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
}

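/*
 * On-wire layout of the payload built above (illustration only):
 *
 *	[encoding start block]	struct_v/compat/len header,
 *				CEPH_ENCODING_START_BLK_LEN bytes
 *	[u32] notify_op		e.g. RBD_NOTIFY_OP_REQUEST_LOCK
 *	[u64] cid.gid		sender's global id
 *	[u64] cid.handle	sender's watch cookie
 *
 * hence the 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN buffer size.
 */
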
static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
			       enum rbd_notify_op notify_op)
{
	struct page **reply_pages;
	size_t reply_len;

	__rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
}

static void rbd_notify_acquired_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  acquired_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
}

static void rbd_notify_released_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  released_lock_work);

	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
}

static int rbd_request_lock(struct rbd_device *rbd_dev)
{
	struct page **reply_pages;
	size_t reply_len;
	bool lock_owner_responded = false;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
				   &reply_pages, &reply_len);
	if (ret && ret != -ETIMEDOUT) {
		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
		goto out;
	}

	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
		void *p = page_address(reply_pages[0]);
		void *const end = p + reply_len;
		u32 n;

		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
		while (n--) {
			u8 struct_v;
			u32 len;

			ceph_decode_need(&p, end, 8 + 8, e_inval);
			p += 8 + 8; /* skip gid and cookie */

			ceph_decode_32_safe(&p, end, len, e_inval);
			if (!len)
				continue;

			if (lock_owner_responded) {
				rbd_warn(rbd_dev,
					 "duplicate lock owners detected");
				ret = -EIO;
				goto out;
			}

			lock_owner_responded = true;
			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
						  &struct_v, &len);
			if (ret) {
				rbd_warn(rbd_dev,
					 "failed to decode ResponseMessage: %d",
					 ret);
				goto e_inval;
			}

			ret = ceph_decode_32(&p);
		}
	}

	if (!lock_owner_responded) {
		rbd_warn(rbd_dev, "no lock owners detected");
		ret = -ETIMEDOUT;
	}

out:
	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
	return ret;

e_inval:
	ret = -EINVAL;
	goto out;
}

static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
{
	dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);

	cancel_delayed_work(&rbd_dev->lock_dwork);
	if (wake_all)
		wake_up_all(&rbd_dev->lock_waitq);
	else
		wake_up(&rbd_dev->lock_waitq);
}

static int get_lock_owner_info(struct rbd_device *rbd_dev,
			       struct ceph_locker **lockers, u32 *num_lockers)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	u8 lock_type;
	char *lock_tag;
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
				 &lock_type, &lock_tag, lockers, num_lockers);
	if (ret)
		return ret;

	if (*num_lockers == 0) {
		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
		goto out;
	}

	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
			 lock_tag);
		ret = -EBUSY;
		goto out;
	}

	if (lock_type == CEPH_CLS_LOCK_SHARED) {
		rbd_warn(rbd_dev, "shared lock type detected");
		ret = -EBUSY;
		goto out;
	}

	if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
		    strlen(RBD_LOCK_COOKIE_PREFIX))) {
		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
			 (*lockers)[0].id.cookie);
		ret = -EBUSY;
		goto out;
	}

out:
	kfree(lock_tag);
	return ret;
}

static int find_watcher(struct rbd_device *rbd_dev,
			const struct ceph_locker *locker)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_watch_item *watchers;
	u32 num_watchers;
	u64 cookie;
	int i;
	int ret;

	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
				      &rbd_dev->header_oloc, &watchers,
				      &num_watchers);
	if (ret)
		return ret;

	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
	for (i = 0; i < num_watchers; i++) {
		if (!memcmp(&watchers[i].addr, &locker->info.addr,
			    sizeof(locker->info.addr)) &&
		    watchers[i].cookie == cookie) {
			struct rbd_client_id cid = {
				.gid = le64_to_cpu(watchers[i].name.num),
				.handle = cookie,
			};

			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
			     rbd_dev, cid.gid, cid.handle);
			rbd_set_owner_cid(rbd_dev, &cid);
			ret = 1;
			goto out;
		}
	}

	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
	ret = 0;
out:
	kfree(watchers);
	return ret;
}

/*
 * lock_rwsem must be held for write
 */
static int rbd_try_lock(struct rbd_device *rbd_dev)
{
	struct ceph_client *client = rbd_dev->rbd_client->client;
	struct ceph_locker *lockers;
	u32 num_lockers;
	int ret;

	for (;;) {
		ret = rbd_lock(rbd_dev);
		if (ret != -EBUSY)
			return ret;

		/* determine if the current lock holder is still alive */
		ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
		if (ret)
			return ret;

		if (num_lockers == 0)
			goto again;

		ret = find_watcher(rbd_dev, lockers);
		if (ret) {
			if (ret > 0)
				ret = 0; /* have to request lock */
			goto out;
		}

		rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
			 ENTITY_NAME(lockers[0].id.name));

		ret = ceph_monc_blacklist_add(&client->monc,
					      &lockers[0].info.addr);
		if (ret) {
			rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
				 ENTITY_NAME(lockers[0].id.name), ret);
			goto out;
		}

		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
					  lockers[0].id.cookie,
					  &lockers[0].id.name);
		if (ret && ret != -ENOENT)
			goto out;

again:
		ceph_free_lockers(lockers, num_lockers);
	}

out:
	ceph_free_lockers(lockers, num_lockers);
	return ret;
}

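/*
 * In pseudocode, the loop above amounts to (illustration only):
 *
 *	while lock attempt returns -EBUSY:
 *		look up the current lockers
 *		if the locker still watches the header: request the lock
 *		else: blacklist the stale client, break its lock, retry
 *
 * so a lock left behind by a crashed client is reclaimed instead of
 * blocking the mapping forever.
 */
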
/*
 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
 */
static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
						int *pret)
{
	enum rbd_lock_state lock_state;

	down_read(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (__rbd_is_lock_owner(rbd_dev)) {
		lock_state = rbd_dev->lock_state;
		up_read(&rbd_dev->lock_rwsem);
		return lock_state;
	}

	up_read(&rbd_dev->lock_rwsem);
	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (!__rbd_is_lock_owner(rbd_dev)) {
		*pret = rbd_try_lock(rbd_dev);
		if (*pret)
			rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
	}

	lock_state = rbd_dev->lock_state;
	up_write(&rbd_dev->lock_rwsem);
	return lock_state;
}

static void rbd_acquire_lock(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, lock_dwork);
	enum rbd_lock_state lock_state;
	int ret = 0;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);
again:
	lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
	if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
		if (lock_state == RBD_LOCK_STATE_LOCKED)
			wake_requests(rbd_dev, true);
		dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
		     rbd_dev, lock_state, ret);
		return;
	}

	ret = rbd_request_lock(rbd_dev);
	if (ret == -ETIMEDOUT) {
		goto again; /* treat this as a dead client */
	} else if (ret == -EROFS) {
		rbd_warn(rbd_dev, "peer will not release lock");
		/*
		 * If this is rbd_add_acquire_lock(), we want to fail
		 * immediately -- reuse BLACKLISTED flag.  Otherwise we
		 * want to block.
		 */
		if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			/* wake "rbd map --exclusive" process */
			wake_requests(rbd_dev, false);
		}
	} else if (ret < 0) {
		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
				 RBD_RETRY_DELAY);
	} else {
		/*
		 * lock owner acked, but resend if we don't see them
		 * release the lock
		 */
		dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
		     rbd_dev);
		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
	}
}

/*
 * lock_rwsem must be held for write
 */
static bool rbd_release_lock(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
		return false;

	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
	downgrade_write(&rbd_dev->lock_rwsem);
	/*
	 * Ensure that all in-flight IO is flushed.
	 *
	 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
	 * may be shared with other devices.
	 */
	ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
	up_read(&rbd_dev->lock_rwsem);

	down_write(&rbd_dev->lock_rwsem);
	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
	     rbd_dev->lock_state);
	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
		return false;

	rbd_unlock(rbd_dev);
	/*
	 * Give others a chance to grab the lock - we would re-acquire
	 * almost immediately if we got new IO during ceph_osdc_sync()
	 * otherwise.  We need to ack our own notifications, so this
	 * lock_dwork will be requeued from rbd_wait_state_locked()
	 * after wake_requests() in rbd_handle_released_lock().
	 */
	cancel_delayed_work(&rbd_dev->lock_dwork);
	return true;
}

static void rbd_release_lock_work(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
						  unlock_work);

	down_write(&rbd_dev->lock_rwsem);
	rbd_release_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);
}

static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			/*
			 * we already know that the remote client is
			 * the owner
			 */
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
				     void **p)
{
	struct rbd_client_id cid = { 0 };

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
		down_write(&rbd_dev->lock_rwsem);
		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
			dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
			     __func__, rbd_dev, cid.gid, cid.handle,
			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
			up_write(&rbd_dev->lock_rwsem);
			return;
		}

		rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
		downgrade_write(&rbd_dev->lock_rwsem);
	} else {
		down_read(&rbd_dev->lock_rwsem);
	}

	if (!__rbd_is_lock_owner(rbd_dev))
		wake_requests(rbd_dev, false);
	up_read(&rbd_dev->lock_rwsem);
}

/*
 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
 * ResponseMessage is needed.
 */
static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
				   void **p)
{
	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
	struct rbd_client_id cid = { 0 };
	int result = 1;

	if (struct_v >= 2) {
		cid.gid = ceph_decode_64(p);
		cid.handle = ceph_decode_64(p);
	}

	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
	     cid.handle);
	if (rbd_cid_equal(&cid, &my_cid))
		return result;

	down_read(&rbd_dev->lock_rwsem);
	if (__rbd_is_lock_owner(rbd_dev)) {
		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
			goto out_unlock;

		/*
		 * encode ResponseMessage(0) so the peer can detect
		 * a missing owner
		 */
		result = 0;

		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
			if (!rbd_dev->opts->exclusive) {
				dout("%s rbd_dev %p queueing unlock_work\n",
				     __func__, rbd_dev);
				queue_work(rbd_dev->task_wq,
					   &rbd_dev->unlock_work);
			} else {
				/* refuse to release the lock */
				result = -EROFS;
			}
		}
	}

out_unlock:
	up_read(&rbd_dev->lock_rwsem);
	return result;
}

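/*
 * Return value summary for rbd_handle_request_lock() (illustration
 * only): 1 means stay silent (we don't own the lock, or the requester
 * is ourselves), 0 is encoded as ResponseMessage(0) and signals that
 * an unlock has been queued, and -EROFS refuses to release because the
 * image was mapped with the exclusive option.
 */
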
static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
				     u64 notify_id, u64 cookie, s32 *result)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
	int buf_size = sizeof(buf);
	int ret;

	if (result) {
		void *p = buf;

		/* encode ResponseMessage */
		ceph_start_encoding(&p, 1, 1,
				    buf_size - CEPH_ENCODING_START_BLK_LEN);
		ceph_encode_32(&p, *result);
	} else {
		buf_size = 0;
	}

	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
				   &rbd_dev->header_oloc, notify_id, cookie,
				   buf, buf_size);
	if (ret)
		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
}

static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
				   u64 cookie)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
}

static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
					  u64 notify_id, u64 cookie, s32 result)
{
	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
}

static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
			 u64 notifier_id, void *data, size_t data_len)
{
	struct rbd_device *rbd_dev = arg;
	void *p = data;
	void *const end = p + data_len;
	u8 struct_v = 0;
	u32 len;
	u32 notify_op;
	int ret;

	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
	     __func__, rbd_dev, cookie, notify_id, data_len);
	if (data_len) {
		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
					  &struct_v, &len);
		if (ret) {
			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
				 ret);
			return;
		}

		notify_op = ceph_decode_32(&p);
	} else {
		/* legacy notification for header updates */
		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
		len = 0;
	}

	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
	switch (notify_op) {
	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_RELEASED_LOCK:
		rbd_handle_released_lock(rbd_dev, struct_v, &p);
		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_REQUEST_LOCK:
		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
		if (ret <= 0)
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, ret);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	case RBD_NOTIFY_OP_HEADER_UPDATE:
		ret = rbd_dev_refresh(rbd_dev);
		if (ret)
			rbd_warn(rbd_dev, "refresh failed: %d", ret);

		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	default:
		if (rbd_is_lock_owner(rbd_dev))
			rbd_acknowledge_notify_result(rbd_dev, notify_id,
						      cookie, -EOPNOTSUPP);
		else
			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
		break;
	}
}

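/*
 * Dispatch summary for rbd_watch_cb() (illustration only):
 *
 *	ACQUIRED_LOCK/RELEASED_LOCK - update owner cid, plain ack
 *	REQUEST_LOCK                - maybe queue unlock; the ack carries
 *				      a ResponseMessage if the result <= 0
 *	HEADER_UPDATE               - refresh the image header, plain ack
 *	anything else               - ack with -EOPNOTSUPP if we own the
 *				      lock, plain ack otherwise
 *
 * A zero-length notify is treated as a legacy header update.
 */
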
static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data--parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

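/*
 * Hypothetical usage sketch (illustration only -- parameter choices
 * are simplified, not the driver's actual calls):
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *	char size_buf[16];
 *	int ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				      &rbd_dev->header_oloc, "get_size",
 *				      &snapid, sizeof(snapid),
 *				      size_buf, sizeof(size_buf));
 *
 * A positive return is the number of bytes placed into size_buf.
 */
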
/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result || !img_request->pending_count)
		goto err_img_request;

	rbd_img_request_submit(img_request);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

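/*
 * Pipeline recap for the work function above (illustration only):
 * decode the request op, sanity-check length/snapshot/EOD, take the
 * exclusive lock if the mapping requires it, build an image request,
 * fill it from the bio (or with no data for discard/zeroout), submit.
 * Failures unwind through the err_* labels in reverse order of setup.
 */
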
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

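/*
 * On-disk v1 header layout read above (illustration only):
 *
 *	struct rbd_image_header_ondisk		fixed-size header
 *	struct rbd_image_snap_ondisk[count]	one entry per snapshot
 *	char snap_names[names_size]		NUL-terminated names, packed
 *
 * snap_count and names_size come from the header itself, which is why
 * the read loops until two consecutive passes agree on the count.
 */
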
/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

Josh Durgin98752012013-08-29 17:26:31 -07003978static void rbd_dev_update_size(struct rbd_device *rbd_dev)
3979{
3980 sector_t size;
Josh Durgin98752012013-08-29 17:26:31 -07003981
3982 /*
Ilya Dryomov811c6682016-04-15 16:22:16 +02003983 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
3984 * try to update its size. If REMOVING is set, updating size
3985 * is just useless work since the device can't be opened.
Josh Durgin98752012013-08-29 17:26:31 -07003986 */
Ilya Dryomov811c6682016-04-15 16:22:16 +02003987 if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
3988 !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
Josh Durgin98752012013-08-29 17:26:31 -07003989 size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
3990 dout("setting size to %llu sectors", (unsigned long long)size);
3991 set_capacity(rbd_dev->disk, size);
3992 revalidate_disk(rbd_dev->disk);
3993 }
3994}
3995
Alex Eldercc4a38bd2013-04-30 00:44:33 -05003996static int rbd_dev_refresh(struct rbd_device *rbd_dev)
Alex Elder1fe5e992012-07-25 09:32:41 -05003997{
Alex Eldere627db02013-05-06 07:40:30 -05003998 u64 mapping_size;
Alex Elder1fe5e992012-07-25 09:32:41 -05003999 int ret;
4000
Alex Eldercfbf6372013-05-31 17:40:45 -05004001 down_write(&rbd_dev->header_rwsem);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004002 mapping_size = rbd_dev->mapping.size;
Ilya Dryomova720ae02014-07-23 17:11:19 +04004003
4004 ret = rbd_dev_header_info(rbd_dev);
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004005 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004006 goto out;
Alex Elder15228ed2013-05-01 12:43:03 -05004007
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004008 /*
4009 * If there is a parent, see if it has disappeared due to the
4010 * mapped image getting flattened.
4011 */
4012 if (rbd_dev->parent) {
4013 ret = rbd_dev_v2_parent_info(rbd_dev);
4014 if (ret)
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004015 goto out;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04004016 }
4017
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004018 if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004019 rbd_dev->mapping.size = rbd_dev->header.image_size;
Ilya Dryomov5ff11082014-07-23 17:11:21 +04004020 } else {
4021 /* validate mapped snapshot's EXISTS flag */
4022 rbd_exists_validate(rbd_dev);
4023 }
Alex Elder15228ed2013-05-01 12:43:03 -05004024
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004025out:
Alex Eldercfbf6372013-05-31 17:40:45 -05004026 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004027 if (!ret && mapping_size != rbd_dev->mapping.size)
Josh Durgin98752012013-08-29 17:26:31 -07004028 rbd_dev_update_size(rbd_dev);
Alex Elder1fe5e992012-07-25 09:32:41 -05004029
Ilya Dryomov73e39e42015-01-08 20:18:22 +03004030 return ret;
Alex Elder1fe5e992012-07-25 09:32:41 -05004031}
4032
Christoph Hellwigd6296d392017-05-01 10:19:08 -06004033static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
4034 unsigned int hctx_idx, unsigned int numa_node)
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004035{
4036 struct work_struct *work = blk_mq_rq_to_pdu(rq);
4037
4038 INIT_WORK(work, rbd_queue_workfn);
4039 return 0;
4040}
4041
Eric Biggersf363b082017-03-30 13:39:16 -07004042static const struct blk_mq_ops rbd_mq_ops = {
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004043 .queue_rq = rbd_queue_rq,
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004044 .init_request = rbd_init_request,
4045};
4046
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004047static int rbd_init_disk(struct rbd_device *rbd_dev)
4048{
4049 struct gendisk *disk;
4050 struct request_queue *q;
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004051 unsigned int objset_bytes =
4052 rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004053 int err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004054
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004055 /* create gendisk info */
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004056 disk = alloc_disk(single_major ?
4057 (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
4058 RBD_MINORS_PER_MAJOR);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004059 if (!disk)
Alex Elder1fcdb8a2012-08-29 17:11:06 -05004060 return -ENOMEM;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004061
Alex Elderf0f8cef2012-01-29 13:57:44 -06004062 snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
Alex Elderde71a292012-07-03 16:01:19 -05004063 rbd_dev->dev_id);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004064 disk->major = rbd_dev->major;
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004065 disk->first_minor = rbd_dev->minor;
Ilya Dryomov7e513d42013-12-16 19:26:32 +02004066 if (single_major)
4067 disk->flags |= GENHD_FL_EXT_DEVT;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004068 disk->fops = &rbd_bd_ops;
4069 disk->private_data = rbd_dev;
4070
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004071 memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
4072 rbd_dev->tag_set.ops = &rbd_mq_ops;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004073 rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004074 rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
Ilya Dryomovb5584182015-06-23 16:21:19 +03004075 rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004076 rbd_dev->tag_set.nr_hw_queues = 1;
4077 rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);
4078
4079 err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
4080 if (err)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004081 goto out_disk;
Josh Durgin029bcbd2011-07-22 11:35:23 -07004082
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004083 q = blk_mq_init_queue(&rbd_dev->tag_set);
4084 if (IS_ERR(q)) {
4085 err = PTR_ERR(q);
4086 goto out_tag_set;
4087 }
4088
Bart Van Assche8b904b52018-03-07 17:10:10 -08004089 blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
Ilya Dryomovd8a2c892015-03-24 16:15:17 +03004090 /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
Alex Elder593a9e72012-02-07 12:03:37 -06004091
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004092 blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
Ilya Dryomov0d9fde42015-10-07 16:09:35 +02004093 q->limits.max_sectors = queue_max_hw_sectors(q);
Ilya Dryomov21acdf42017-12-21 15:35:11 +01004094 blk_queue_max_segments(q, USHRT_MAX);
Ilya Dryomov24f1df62018-01-12 17:22:10 +01004095 blk_queue_max_segment_size(q, UINT_MAX);
Ilya Dryomov420efbd2018-04-16 09:32:18 +02004096 blk_queue_io_min(q, objset_bytes);
4097 blk_queue_io_opt(q, objset_bytes);
Josh Durgin029bcbd2011-07-22 11:35:23 -07004098
Ilya Dryomovd9360542018-03-23 06:14:47 +01004099 if (rbd_dev->opts->trim) {
4100 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
4101 q->limits.discard_granularity = objset_bytes;
4102 blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
4103 blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
4104 }
Guangliang Zhao90e98c52014-04-01 22:22:16 +08004105
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004106 if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
Jan Karadc3b17c2017-02-02 15:56:50 +01004107 q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
Ronny Hegewaldbae818e2015-10-15 18:50:46 +00004108
Ilya Dryomov5769ed02017-04-13 12:17:38 +02004109 /*
4110 * disk_release() expects a queue ref from add_disk() and will
4111 * put it. Hold an extra ref until add_disk() is called.
4112 */
4113 WARN_ON(!blk_get_queue(q));
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004114 disk->queue = q;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004115 q->queuedata = rbd_dev;
4116
4117 rbd_dev->disk = disk;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004118
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004119 return 0;
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004120out_tag_set:
4121 blk_mq_free_tag_set(&rbd_dev->tag_set);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004122out_disk:
4123 put_disk(disk);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01004124 return err;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004125}
4126
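/*
 * Worked example for the queue limits above (editor's note; values
 * assume the common defaults of a 4 MiB object_size and a
 * stripe_count of 1): objset_bytes = 4194304, so io_min and io_opt
 * are 4 MiB and blk_queue_max_hw_sectors() is given
 * 4194304 >> SECTOR_SHIFT = 8192 512-byte sectors.
 */
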
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004127/*
4128 sysfs
4129*/
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004130
Alex Elder593a9e72012-02-07 12:03:37 -06004131static struct rbd_device *dev_to_rbd_dev(struct device *dev)
4132{
4133 return container_of(dev, struct rbd_device, dev);
4134}
4135
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004136static ssize_t rbd_size_show(struct device *dev,
4137 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004138{
Alex Elder593a9e72012-02-07 12:03:37 -06004139 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004140
Alex Elderfc71d832013-04-26 15:44:36 -05004141 return sprintf(buf, "%llu\n",
4142 (unsigned long long)rbd_dev->mapping.size);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004143}
4144
Alex Elder34b13182012-07-13 20:35:12 -05004145/*
4146 * Note this shows the features for whatever's mapped, which is not
4147 * necessarily the base image.
4148 */
4149static ssize_t rbd_features_show(struct device *dev,
4150 struct device_attribute *attr, char *buf)
4151{
4152 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4153
4154 return sprintf(buf, "0x%016llx\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004155 (unsigned long long)rbd_dev->mapping.features);
Alex Elder34b13182012-07-13 20:35:12 -05004156}
4157
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004158static ssize_t rbd_major_show(struct device *dev,
4159 struct device_attribute *attr, char *buf)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004160{
Alex Elder593a9e72012-02-07 12:03:37 -06004161 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004162
Alex Elderfc71d832013-04-26 15:44:36 -05004163 if (rbd_dev->major)
4164 return sprintf(buf, "%d\n", rbd_dev->major);
4165
4166 return sprintf(buf, "(none)\n");
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004167}
Alex Elderfc71d832013-04-26 15:44:36 -05004168
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004169static ssize_t rbd_minor_show(struct device *dev,
4170 struct device_attribute *attr, char *buf)
4171{
4172 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4173
4174 return sprintf(buf, "%d\n", rbd_dev->minor);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004175}
4176
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004177static ssize_t rbd_client_addr_show(struct device *dev,
4178 struct device_attribute *attr, char *buf)
4179{
4180 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4181 struct ceph_entity_addr *client_addr =
4182 ceph_client_addr(rbd_dev->rbd_client->client);
4183
4184 return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
4185 le32_to_cpu(client_addr->nonce));
4186}
4187
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004188static ssize_t rbd_client_id_show(struct device *dev,
4189 struct device_attribute *attr, char *buf)
4190{
Alex Elder593a9e72012-02-07 12:03:37 -06004191 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004192
Alex Elder1dbb4392012-01-24 10:08:37 -06004193 return sprintf(buf, "client%lld\n",
Ilya Dryomov033268a2016-08-12 14:59:58 +02004194 ceph_client_gid(rbd_dev->rbd_client->client));
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004195}
4196
Mike Christie267fb902016-08-18 18:38:43 +02004197static ssize_t rbd_cluster_fsid_show(struct device *dev,
4198 struct device_attribute *attr, char *buf)
4199{
4200 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4201
4202 return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
4203}
4204
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004205static ssize_t rbd_config_info_show(struct device *dev,
4206 struct device_attribute *attr, char *buf)
4207{
4208 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4209
4210 return sprintf(buf, "%s\n", rbd_dev->config_info);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004211}
4212
4213static ssize_t rbd_pool_show(struct device *dev,
4214 struct device_attribute *attr, char *buf)
4215{
Alex Elder593a9e72012-02-07 12:03:37 -06004216 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004217
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004218 return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004219}
4220
Alex Elder9bb2f332012-07-12 10:46:35 -05004221static ssize_t rbd_pool_id_show(struct device *dev,
4222 struct device_attribute *attr, char *buf)
4223{
4224 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4225
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004226 return sprintf(buf, "%llu\n",
Alex Elderfc71d832013-04-26 15:44:36 -05004227 (unsigned long long) rbd_dev->spec->pool_id);
Alex Elder9bb2f332012-07-12 10:46:35 -05004228}
4229
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004230static ssize_t rbd_pool_ns_show(struct device *dev,
4231 struct device_attribute *attr, char *buf)
4232{
4233 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4234
4235 return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
4236}
4237
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004238static ssize_t rbd_name_show(struct device *dev,
4239 struct device_attribute *attr, char *buf)
4240{
Alex Elder593a9e72012-02-07 12:03:37 -06004241 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004242
Alex Eldera92ffdf2012-10-30 19:40:33 -05004243 if (rbd_dev->spec->image_name)
4244 return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
4245
4246 return sprintf(buf, "(unknown)\n");
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004247}
4248
Alex Elder589d30e2012-07-10 20:30:11 -05004249static ssize_t rbd_image_id_show(struct device *dev,
4250 struct device_attribute *attr, char *buf)
4251{
4252 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4253
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004254 return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05004255}
4256
Alex Elder34b13182012-07-13 20:35:12 -05004257/*
4258 * Shows the name of the currently-mapped snapshot (or
4259 * RBD_SNAP_HEAD_NAME for the base image).
4260 */
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004261static ssize_t rbd_snap_show(struct device *dev,
4262 struct device_attribute *attr,
4263 char *buf)
4264{
Alex Elder593a9e72012-02-07 12:03:37 -06004265 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004266
Alex Elder0d7dbfc2012-10-25 23:34:41 -05004267 return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004268}
4269
Mike Christie92a58672016-08-18 18:38:44 +02004270static ssize_t rbd_snap_id_show(struct device *dev,
4271 struct device_attribute *attr, char *buf)
4272{
4273 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4274
4275 return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
4276}
4277
Alex Elder86b00e02012-10-25 23:34:42 -05004278/*
Ilya Dryomovff961282014-07-22 21:53:07 +04004279 * For a v2 image, shows the chain of parent images, separated by empty
4280 * lines. For v1 images or if there is no parent, shows "(no parent
4281 * image)".
Alex Elder86b00e02012-10-25 23:34:42 -05004282 */
4283static ssize_t rbd_parent_show(struct device *dev,
Ilya Dryomovff961282014-07-22 21:53:07 +04004284 struct device_attribute *attr,
4285 char *buf)
Alex Elder86b00e02012-10-25 23:34:42 -05004286{
4287 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Ilya Dryomovff961282014-07-22 21:53:07 +04004288 ssize_t count = 0;
Alex Elder86b00e02012-10-25 23:34:42 -05004289
Ilya Dryomovff961282014-07-22 21:53:07 +04004290 if (!rbd_dev->parent)
Alex Elder86b00e02012-10-25 23:34:42 -05004291 return sprintf(buf, "(no parent image)\n");
4292
Ilya Dryomovff961282014-07-22 21:53:07 +04004293 for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
4294 struct rbd_spec *spec = rbd_dev->parent_spec;
Alex Elder86b00e02012-10-25 23:34:42 -05004295
Ilya Dryomovff961282014-07-22 21:53:07 +04004296 count += sprintf(&buf[count], "%s"
4297 "pool_id %llu\npool_name %s\n"
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004298 "pool_ns %s\n"
Ilya Dryomovff961282014-07-22 21:53:07 +04004299 "image_id %s\nimage_name %s\n"
4300 "snap_id %llu\nsnap_name %s\n"
4301 "overlap %llu\n",
4302 !count ? "" : "\n", /* first? */
4303 spec->pool_id, spec->pool_name,
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004304 spec->pool_ns ?: "",
Ilya Dryomovff961282014-07-22 21:53:07 +04004305 spec->image_id, spec->image_name ?: "(unknown)",
4306 spec->snap_id, spec->snap_name,
4307 rbd_dev->parent_overlap);
4308 }
Alex Elder86b00e02012-10-25 23:34:42 -05004309
Ilya Dryomovff961282014-07-22 21:53:07 +04004310 return count;
Alex Elder86b00e02012-10-25 23:34:42 -05004311}
4312
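/*
 * Example of the 'parent' attribute output produced by the format
 * string above for a single-level chain (all values illustrative):
 *
 *	pool_id 2
 *	pool_name rbd
 *	pool_ns
 *	image_id 1866ac9a6b44
 *	image_name parent-image
 *	snap_id 4
 *	snap_name snap1
 *	overlap 10737418240
 *
 * Each additional ancestor appends another such record, separated
 * from the previous one by an empty line.
 */
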
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004313static ssize_t rbd_image_refresh(struct device *dev,
4314 struct device_attribute *attr,
4315 const char *buf,
4316 size_t size)
4317{
Alex Elder593a9e72012-02-07 12:03:37 -06004318 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
Alex Elderb8136232012-07-25 09:32:41 -05004319 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004320
Alex Eldercc4a38bd2013-04-30 00:44:33 -05004321 ret = rbd_dev_refresh(rbd_dev);
Alex Eldere627db02013-05-06 07:40:30 -05004322 if (ret)
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004323 return ret;
Alex Elderb8136232012-07-25 09:32:41 -05004324
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004325 return size;
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004326}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07004327
Joe Perches5657a812018-05-24 13:38:59 -06004328static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
4329static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
4330static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
4331static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
4332static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
4333static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
4334static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
4335static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
4336static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
4337static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004338static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
Joe Perches5657a812018-05-24 13:38:59 -06004339static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
4340static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
4341static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
4342static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
4343static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
4344static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004345
4346static struct attribute *rbd_attrs[] = {
4347 &dev_attr_size.attr,
Alex Elder34b13182012-07-13 20:35:12 -05004348 &dev_attr_features.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004349 &dev_attr_major.attr,
Ilya Dryomovdd82fff2013-12-13 15:28:57 +02004350 &dev_attr_minor.attr,
Ilya Dryomov005a07bf2016-08-18 18:38:43 +02004351 &dev_attr_client_addr.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004352 &dev_attr_client_id.attr,
Mike Christie267fb902016-08-18 18:38:43 +02004353 &dev_attr_cluster_fsid.attr,
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004354 &dev_attr_config_info.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004355 &dev_attr_pool.attr,
Alex Elder9bb2f332012-07-12 10:46:35 -05004356 &dev_attr_pool_id.attr,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004357 &dev_attr_pool_ns.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004358 &dev_attr_name.attr,
Alex Elder589d30e2012-07-10 20:30:11 -05004359 &dev_attr_image_id.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004360 &dev_attr_current_snap.attr,
Mike Christie92a58672016-08-18 18:38:44 +02004361 &dev_attr_snap_id.attr,
Alex Elder86b00e02012-10-25 23:34:42 -05004362 &dev_attr_parent.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004363 &dev_attr_refresh.attr,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004364 NULL
4365};
4366
4367static struct attribute_group rbd_attr_group = {
4368 .attrs = rbd_attrs,
4369};
4370
4371static const struct attribute_group *rbd_attr_groups[] = {
4372 &rbd_attr_group,
4373 NULL
4374};
4375
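/*
 * Editor's note: these attributes surface under the rbd bus in sysfs,
 * e.g. for dev_id 0:
 *
 *	/sys/bus/rbd/devices/0/size
 *	/sys/bus/rbd/devices/0/current_snap
 *	/sys/bus/rbd/devices/0/parent
 *
 * Writing anything to /sys/bus/rbd/devices/0/refresh invokes
 * rbd_image_refresh() and thus rbd_dev_refresh() above.
 */
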
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004376static void rbd_dev_release(struct device *dev);
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004377
Bhumika Goyalb9942bc2017-02-11 12:14:38 +05304378static const struct device_type rbd_device_type = {
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004379 .name = "rbd",
4380 .groups = rbd_attr_groups,
Ilya Dryomov6cac4692015-10-16 20:11:25 +02004381 .release = rbd_dev_release,
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004382};
4383
Alex Elder8b8fb992012-10-26 17:25:24 -05004384static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
4385{
4386 kref_get(&spec->kref);
4387
4388 return spec;
4389}
4390
4391static void rbd_spec_free(struct kref *kref);
4392static void rbd_spec_put(struct rbd_spec *spec)
4393{
4394 if (spec)
4395 kref_put(&spec->kref, rbd_spec_free);
4396}
4397
4398static struct rbd_spec *rbd_spec_alloc(void)
4399{
4400 struct rbd_spec *spec;
4401
4402 spec = kzalloc(sizeof (*spec), GFP_KERNEL);
4403 if (!spec)
4404 return NULL;
Ilya Dryomov04077592014-07-23 17:11:20 +04004405
4406 spec->pool_id = CEPH_NOPOOL;
4407 spec->snap_id = CEPH_NOSNAP;
Alex Elder8b8fb992012-10-26 17:25:24 -05004408 kref_init(&spec->kref);
4409
Alex Elder8b8fb992012-10-26 17:25:24 -05004410 return spec;
4411}
4412
4413static void rbd_spec_free(struct kref *kref)
4414{
4415 struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
4416
4417 kfree(spec->pool_name);
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004418 kfree(spec->pool_ns);
Alex Elder8b8fb992012-10-26 17:25:24 -05004419 kfree(spec->image_id);
4420 kfree(spec->image_name);
4421 kfree(spec->snap_name);
4422 kfree(spec);
4423}
4424
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004425static void rbd_dev_free(struct rbd_device *rbd_dev)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004426{
Ilya Dryomov99d16942016-08-12 16:11:41 +02004427 WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004428 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004429
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004430 ceph_oid_destroy(&rbd_dev->header_oid);
Ilya Dryomov6b6dddb2016-08-05 16:15:38 +02004431 ceph_oloc_destroy(&rbd_dev->header_oloc);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02004432 kfree(rbd_dev->config_info);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004433
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004434 rbd_put_client(rbd_dev->rbd_client);
4435 rbd_spec_put(rbd_dev->spec);
4436 kfree(rbd_dev->opts);
4437 kfree(rbd_dev);
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004438}
4439
4440static void rbd_dev_release(struct device *dev)
4441{
4442 struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
4443 bool need_put = !!rbd_dev->opts;
4444
4445 if (need_put) {
4446 destroy_workqueue(rbd_dev->task_wq);
4447 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4448 }
4449
4450 rbd_dev_free(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004451
4452 /*
4453	 * This is racy, but way better than putting the module ref outside of
4454 * the release callback. The race window is pretty small, so
4455 * doing something similar to dm (dm-builtin.c) is overkill.
4456 */
4457 if (need_put)
4458 module_put(THIS_MODULE);
4459}
4460
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004461static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
4462 struct rbd_spec *spec)
Alex Elderc53d5892012-10-25 23:34:42 -05004463{
4464 struct rbd_device *rbd_dev;
4465
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004466 rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
Alex Elderc53d5892012-10-25 23:34:42 -05004467 if (!rbd_dev)
4468 return NULL;
4469
4470 spin_lock_init(&rbd_dev->lock);
4471 INIT_LIST_HEAD(&rbd_dev->node);
Alex Elderc53d5892012-10-25 23:34:42 -05004472 init_rwsem(&rbd_dev->header_rwsem);
4473
Ilya Dryomov7e973322017-01-25 18:16:22 +01004474 rbd_dev->header.data_pool_id = CEPH_NOPOOL;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004475 ceph_oid_init(&rbd_dev->header_oid);
Ilya Dryomov431a02c2017-01-25 18:16:21 +01004476 rbd_dev->header_oloc.pool = spec->pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004477 if (spec->pool_ns) {
4478 WARN_ON(!*spec->pool_ns);
4479 rbd_dev->header_oloc.pool_ns =
4480 ceph_find_or_create_string(spec->pool_ns,
4481 strlen(spec->pool_ns));
4482 }
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02004483
Ilya Dryomov99d16942016-08-12 16:11:41 +02004484 mutex_init(&rbd_dev->watch_mutex);
4485 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
4486 INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
4487
Ilya Dryomoved95b212016-08-12 16:40:02 +02004488 init_rwsem(&rbd_dev->lock_rwsem);
4489 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
4490 INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
4491 INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
4492 INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
4493 INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
4494 init_waitqueue_head(&rbd_dev->lock_waitq);
4495
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004496 rbd_dev->dev.bus = &rbd_bus_type;
4497 rbd_dev->dev.type = &rbd_device_type;
4498 rbd_dev->dev.parent = &rbd_root_dev;
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004499 device_initialize(&rbd_dev->dev);
4500
Alex Elderc53d5892012-10-25 23:34:42 -05004501 rbd_dev->rbd_client = rbdc;
Ilya Dryomovd1475432015-06-22 13:24:48 +03004502 rbd_dev->spec = spec;
Alex Elder0903e872012-11-14 12:25:19 -06004503
Alex Elderc53d5892012-10-25 23:34:42 -05004504 return rbd_dev;
4505}
4506
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02004507/*
4508 * Create an rbd_dev for a mapping.
4509 */
4510static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
4511 struct rbd_spec *spec,
4512 struct rbd_options *opts)
4513{
4514 struct rbd_device *rbd_dev;
4515
4516 rbd_dev = __rbd_dev_create(rbdc, spec);
4517 if (!rbd_dev)
4518 return NULL;
4519
4520 rbd_dev->opts = opts;
4521
4522 /* get an id and fill in device name */
4523 rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
4524 minor_to_rbd_dev_id(1 << MINORBITS),
4525 GFP_KERNEL);
4526 if (rbd_dev->dev_id < 0)
4527 goto fail_rbd_dev;
4528
4529 sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
4530 rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
4531 rbd_dev->name);
4532 if (!rbd_dev->task_wq)
4533 goto fail_dev_id;
4534
4535 /* we have a ref from do_rbd_add() */
4536 __module_get(THIS_MODULE);
4537
4538 dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
4539 return rbd_dev;
4540
4541fail_dev_id:
4542 ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
4543fail_rbd_dev:
4544 rbd_dev_free(rbd_dev);
4545 return NULL;
4546}
4547
Alex Elderc53d5892012-10-25 23:34:42 -05004548static void rbd_dev_destroy(struct rbd_device *rbd_dev)
4549{
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02004550 if (rbd_dev)
4551 put_device(&rbd_dev->dev);
Alex Elderc53d5892012-10-25 23:34:42 -05004552}
4553
Yehuda Sadehdfc56062010-11-19 14:51:04 -08004554/*
Alex Elder9d475de2012-07-03 16:01:19 -05004555 * Get the size and object order for an image snapshot, or, if
4556 * snap_id is CEPH_NOSNAP, get this information for the base
4557 * image.
4558 */
4559static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
4560 u8 *order, u64 *snap_size)
4561{
4562 __le64 snapid = cpu_to_le64(snap_id);
4563 int ret;
4564 struct {
4565 u8 order;
4566 __le64 size;
4567 } __attribute__ ((packed)) size_buf = { 0 };
4568
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004569 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4570 &rbd_dev->header_oloc, "get_size",
4571 &snapid, sizeof(snapid),
4572 &size_buf, sizeof(size_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004573 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder9d475de2012-07-03 16:01:19 -05004574 if (ret < 0)
4575 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004576 if (ret < sizeof (size_buf))
4577 return -ERANGE;
Alex Elder9d475de2012-07-03 16:01:19 -05004578
Josh Durginc3545572013-08-28 17:08:10 -07004579 if (order) {
Alex Elderc86f86e2013-04-25 15:09:41 -05004580 *order = size_buf.order;
Josh Durginc3545572013-08-28 17:08:10 -07004581 dout(" order %u", (unsigned int)*order);
4582 }
Alex Elder9d475de2012-07-03 16:01:19 -05004583 *snap_size = le64_to_cpu(size_buf.size);
4584
Josh Durginc3545572013-08-28 17:08:10 -07004585 dout(" snap_id 0x%016llx snap_size = %llu\n",
4586 (unsigned long long)snap_id,
Alex Elder57385b52013-04-21 12:14:45 -05004587 (unsigned long long)*snap_size);
Alex Elder9d475de2012-07-03 16:01:19 -05004588
4589 return 0;
4590}
4591
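/*
 * Worked example (editor's note): for an image created with the
 * default object order of 22, size_buf.order comes back as 22,
 * i.e. 1 << 22 = 4 MiB objects, and size_buf.size is the image size
 * in bytes at the given snapshot.
 */
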
4592static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
4593{
4594 return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
4595 &rbd_dev->header.obj_order,
4596 &rbd_dev->header.image_size);
4597}
4598
Alex Elder1e130192012-07-03 16:01:19 -05004599static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
4600{
4601 void *reply_buf;
4602 int ret;
4603 void *p;
4604
4605 reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
4606 if (!reply_buf)
4607 return -ENOMEM;
4608
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004609 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4610 &rbd_dev->header_oloc, "get_object_prefix",
4611 NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06004612 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder1e130192012-07-03 16:01:19 -05004613 if (ret < 0)
4614 goto out;
4615
4616 p = reply_buf;
4617 rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
Alex Elder57385b52013-04-21 12:14:45 -05004618 p + ret, NULL, GFP_NOIO);
4619 ret = 0;
Alex Elder1e130192012-07-03 16:01:19 -05004620
4621 if (IS_ERR(rbd_dev->header.object_prefix)) {
4622 ret = PTR_ERR(rbd_dev->header.object_prefix);
4623 rbd_dev->header.object_prefix = NULL;
4624 } else {
4625 dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
4626 }
Alex Elder1e130192012-07-03 16:01:19 -05004627out:
4628 kfree(reply_buf);
4629
4630 return ret;
4631}
4632
Alex Elderb1b54022012-07-03 16:01:19 -05004633static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
4634 u64 *snap_features)
4635{
4636 __le64 snapid = cpu_to_le64(snap_id);
4637 struct {
4638 __le64 features;
4639 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05004640 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004641 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05004642 int ret;
4643
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004644 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4645 &rbd_dev->header_oloc, "get_features",
4646 &snapid, sizeof(snapid),
4647 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06004648 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05004649 if (ret < 0)
4650 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05004651 if (ret < sizeof (features_buf))
4652 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07004653
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004654 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
4655 if (unsup) {
4656 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
4657 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05004658 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02004659 }
Alex Elderd8891402012-10-09 13:50:17 -07004660
Alex Elderb1b54022012-07-03 16:01:19 -05004661 *snap_features = le64_to_cpu(features_buf.features);
4662
4663 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05004664 (unsigned long long)snap_id,
4665 (unsigned long long)*snap_features,
4666 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05004667
4668 return 0;
4669}
4670
4671static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
4672{
4673 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
4674 &rbd_dev->header.features);
4675}
4676
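/*
 * Editor's note on the checks above: features_buf.features is the set
 * of features the image has enabled, while features_buf.incompat is
 * the subset a client must understand in order to use the image at
 * all. Any incompat bit outside RBD_FEATURES_SUPPORTED therefore
 * fails the map with -ENXIO rather than being silently ignored.
 */
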
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004677struct parent_image_info {
4678 u64 pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004679 const char *pool_ns;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004680 const char *image_id;
4681 u64 snap_id;
4682
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004683 bool has_overlap;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004684 u64 overlap;
4685};
4686
4687/*
4688 * The caller is responsible for @pii.
4689 */
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004690static int decode_parent_image_spec(void **p, void *end,
4691 struct parent_image_info *pii)
4692{
4693 u8 struct_v;
4694 u32 struct_len;
4695 int ret;
4696
4697 ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
4698 &struct_v, &struct_len);
4699 if (ret)
4700 return ret;
4701
4702 ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
4703 pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4704 if (IS_ERR(pii->pool_ns)) {
4705 ret = PTR_ERR(pii->pool_ns);
4706 pii->pool_ns = NULL;
4707 return ret;
4708 }
4709 pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
4710 if (IS_ERR(pii->image_id)) {
4711 ret = PTR_ERR(pii->image_id);
4712 pii->image_id = NULL;
4713 return ret;
4714 }
4715 ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
4716 return 0;
4717
4718e_inval:
4719 return -EINVAL;
4720}
4721
4722static int __get_parent_info(struct rbd_device *rbd_dev,
4723 struct page *req_page,
4724 struct page *reply_page,
4725 struct parent_image_info *pii)
4726{
4727 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4728 size_t reply_len = PAGE_SIZE;
4729 void *p, *end;
4730 int ret;
4731
4732 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4733 "rbd", "parent_get", CEPH_OSD_FLAG_READ,
4734 req_page, sizeof(u64), reply_page, &reply_len);
4735 if (ret)
4736 return ret == -EOPNOTSUPP ? 1 : ret;
4737
4738 p = page_address(reply_page);
4739 end = p + reply_len;
4740 ret = decode_parent_image_spec(&p, end, pii);
4741 if (ret)
4742 return ret;
4743
4744 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4745 "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
4746 req_page, sizeof(u64), reply_page, &reply_len);
4747 if (ret)
4748 return ret;
4749
4750 p = page_address(reply_page);
4751 end = p + reply_len;
4752 ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
4753 if (pii->has_overlap)
4754 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4755
4756 return 0;
4757
4758e_inval:
4759 return -EINVAL;
4760}
4761
4762/*
4763 * The caller is responsible for @pii.
4764 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004765static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
4766 struct page *req_page,
4767 struct page *reply_page,
4768 struct parent_image_info *pii)
4769{
4770 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
4771 size_t reply_len = PAGE_SIZE;
4772 void *p, *end;
4773 int ret;
4774
4775 ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
4776 "rbd", "get_parent", CEPH_OSD_FLAG_READ,
4777 req_page, sizeof(u64), reply_page, &reply_len);
4778 if (ret)
4779 return ret;
4780
4781 p = page_address(reply_page);
4782 end = p + reply_len;
4783 ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
4784 pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
4785 if (IS_ERR(pii->image_id)) {
4786 ret = PTR_ERR(pii->image_id);
4787 pii->image_id = NULL;
4788 return ret;
4789 }
4790 ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004791 pii->has_overlap = true;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004792 ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
4793
4794 return 0;
4795
4796e_inval:
4797 return -EINVAL;
4798}
4799
4800static int get_parent_info(struct rbd_device *rbd_dev,
4801 struct parent_image_info *pii)
4802{
4803 struct page *req_page, *reply_page;
4804 void *p;
4805 int ret;
4806
4807 req_page = alloc_page(GFP_KERNEL);
4808 if (!req_page)
4809 return -ENOMEM;
4810
4811 reply_page = alloc_page(GFP_KERNEL);
4812 if (!reply_page) {
4813 __free_page(req_page);
4814 return -ENOMEM;
4815 }
4816
4817 p = page_address(req_page);
4818 ceph_encode_64(&p, rbd_dev->spec->snap_id);
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004819 ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
4820 if (ret > 0)
4821 ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
4822 pii);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004823
4824 __free_page(req_page);
4825 __free_page(reply_page);
4826 return ret;
4827}
4828
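/*
 * Editor's note: get_parent_info() first tries the newer "parent_get"
 * and "parent_overlap_get" class methods; __get_parent_info() maps an
 * -EOPNOTSUPP reply to 1, which is the cue to fall back to the older
 * "get_parent" method (__get_parent_info_legacy()) on clusters that
 * predate the namespace-aware interface.
 */
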
Alex Elder86b00e02012-10-25 23:34:42 -05004829static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
4830{
4831 struct rbd_spec *parent_spec;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004832 struct parent_image_info pii = { 0 };
Alex Elder86b00e02012-10-25 23:34:42 -05004833 int ret;
4834
4835 parent_spec = rbd_spec_alloc();
4836 if (!parent_spec)
4837 return -ENOMEM;
4838
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004839 ret = get_parent_info(rbd_dev, &pii);
4840 if (ret)
Alex Elder86b00e02012-10-25 23:34:42 -05004841 goto out_err;
4842
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004843 dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
4844 __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
4845 pii.has_overlap, pii.overlap);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004846
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004847 if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
Alex Elder392a9da2013-05-06 17:40:33 -05004848 /*
4849		 * Either the parent never existed, or we have a
4850		 * record of it but the image got flattened so it no
4851 * longer has a parent. When the parent of a
4852 * layered image disappears we immediately set the
4853 * overlap to 0. The effect of this is that all new
4854 * requests will be treated as if the image had no
4855 * parent.
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004856 *
4857 * If !pii.has_overlap, the parent image spec is not
4858 * applicable. It's there to avoid duplication in each
4859 * snapshot record.
Alex Elder392a9da2013-05-06 17:40:33 -05004860 */
4861 if (rbd_dev->parent_overlap) {
4862 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05004863 rbd_dev_parent_put(rbd_dev);
4864 pr_info("%s: clone image has been flattened\n",
4865 rbd_dev->disk->disk_name);
4866 }
4867
Alex Elder86b00e02012-10-25 23:34:42 -05004868 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05004869 }
Alex Elder86b00e02012-10-25 23:34:42 -05004870
Alex Elder0903e872012-11-14 12:25:19 -06004871 /* The ceph file layout needs to fit pool id in 32 bits */
4872
4873 ret = -EIO;
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004874 if (pii.pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04004875 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004876 (unsigned long long)pii.pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05004877 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05004878 }
Alex Elder0903e872012-11-14 12:25:19 -06004879
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004880 /*
4881 * The parent won't change (except when the clone is
4882	 * flattened, which is handled above). So we only need to
4883	 * record the parent spec if we have not already done so.
4884 */
4885 if (!rbd_dev->parent_spec) {
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004886 parent_spec->pool_id = pii.pool_id;
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004887 if (pii.pool_ns && *pii.pool_ns) {
4888 parent_spec->pool_ns = pii.pool_ns;
4889 pii.pool_ns = NULL;
4890 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004891 parent_spec->image_id = pii.image_id;
4892 pii.image_id = NULL;
4893 parent_spec->snap_id = pii.snap_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02004894
Alex Elder70cf49c2013-05-06 17:40:33 -05004895 rbd_dev->parent_spec = parent_spec;
4896 parent_spec = NULL; /* rbd_dev now owns this */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004897 }
4898
4899 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004900 * We always update the parent overlap. If it's zero we issue
4901	 * a warning, as we will proceed as if there were no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004902 */
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004903 if (!pii.overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004904 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004905 /* refresh, careful to warn just once */
4906 if (rbd_dev->parent_overlap)
4907 rbd_warn(rbd_dev,
4908 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004909 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004910 /* initial probe */
4911 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05004912 }
Alex Elder70cf49c2013-05-06 17:40:33 -05004913 }
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004914 rbd_dev->parent_overlap = pii.overlap;
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03004915
Alex Elder86b00e02012-10-25 23:34:42 -05004916out:
4917 ret = 0;
4918out_err:
Ilya Dryomove92c0ea2018-08-22 17:26:10 +02004919 kfree(pii.pool_ns);
Ilya Dryomoveb3b2d62018-08-22 17:11:27 +02004920 kfree(pii.image_id);
Alex Elder86b00e02012-10-25 23:34:42 -05004921 rbd_spec_put(parent_spec);
Alex Elder86b00e02012-10-25 23:34:42 -05004922 return ret;
4923}
4924
Alex Eldercc070d52013-04-21 12:14:45 -05004925static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
4926{
4927 struct {
4928 __le64 stripe_unit;
4929 __le64 stripe_count;
4930 } __attribute__ ((packed)) striping_info_buf = { 0 };
4931 size_t size = sizeof (striping_info_buf);
4932 void *p;
Alex Eldercc070d52013-04-21 12:14:45 -05004933 int ret;
4934
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004935 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4936 &rbd_dev->header_oloc, "get_stripe_unit_count",
4937 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05004938 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
4939 if (ret < 0)
4940 return ret;
4941 if (ret < size)
4942 return -ERANGE;
4943
Alex Eldercc070d52013-04-21 12:14:45 -05004944 p = &striping_info_buf;
Ilya Dryomovb1331852018-02-07 12:09:12 +01004945 rbd_dev->header.stripe_unit = ceph_decode_64(&p);
4946 rbd_dev->header.stripe_count = ceph_decode_64(&p);
Alex Eldercc070d52013-04-21 12:14:45 -05004947 return 0;
4948}
4949
Ilya Dryomov7e973322017-01-25 18:16:22 +01004950static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
4951{
4952 __le64 data_pool_id;
4953 int ret;
4954
4955 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
4956 &rbd_dev->header_oloc, "get_data_pool",
4957 NULL, 0, &data_pool_id, sizeof(data_pool_id));
4958 if (ret < 0)
4959 return ret;
4960 if (ret < sizeof(data_pool_id))
4961 return -EBADMSG;
4962
4963 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
4964 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
4965 return 0;
4966}
4967
Alex Elder9e15b772012-10-30 19:40:33 -05004968static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
4969{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004970 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05004971 size_t image_id_size;
4972 char *image_id;
4973 void *p;
4974 void *end;
4975 size_t size;
4976 void *reply_buf = NULL;
4977 size_t len = 0;
4978 char *image_name = NULL;
4979 int ret;
4980
4981 rbd_assert(!rbd_dev->spec->image_name);
4982
Alex Elder69e7a022012-11-01 08:39:26 -05004983 len = strlen(rbd_dev->spec->image_id);
4984 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05004985 image_id = kmalloc(image_id_size, GFP_KERNEL);
4986 if (!image_id)
4987 return NULL;
4988
4989 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05004990 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05004991 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05004992
4993 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
4994 reply_buf = kmalloc(size, GFP_KERNEL);
4995 if (!reply_buf)
4996 goto out;
4997
Ilya Dryomovecd4a682017-01-25 18:16:21 +01004998 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
4999 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5000 "dir_get_name", image_id, image_id_size,
5001 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005002 if (ret < 0)
5003 goto out;
5004 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005005 end = reply_buf + ret;
5006
Alex Elder9e15b772012-10-30 19:40:33 -05005007 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5008 if (IS_ERR(image_name))
5009 image_name = NULL;
5010 else
5011 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5012out:
5013 kfree(reply_buf);
5014 kfree(image_id);
5015
5016 return image_name;
5017}
5018
Alex Elder2ad3d712013-04-30 00:44:33 -05005019static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5020{
5021 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5022 const char *snap_name;
5023 u32 which = 0;
5024
5025 /* Skip over names until we find the one we are looking for */
5026
5027 snap_name = rbd_dev->header.snap_names;
5028 while (which < snapc->num_snaps) {
5029 if (!strcmp(name, snap_name))
5030 return snapc->snaps[which];
5031 snap_name += strlen(snap_name) + 1;
5032 which++;
5033 }
5034 return CEPH_NOSNAP;
5035}
5036
5037static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5038{
5039 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5040 u32 which;
5041 bool found = false;
5042 u64 snap_id;
5043
5044 for (which = 0; !found && which < snapc->num_snaps; which++) {
5045 const char *snap_name;
5046
5047 snap_id = snapc->snaps[which];
5048 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005049 if (IS_ERR(snap_name)) {
5050 /* ignore no-longer existing snapshots */
5051 if (PTR_ERR(snap_name) == -ENOENT)
5052 continue;
5053 else
5054 break;
5055 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005056 found = !strcmp(name, snap_name);
5057 kfree(snap_name);
5058 }
5059 return found ? snap_id : CEPH_NOSNAP;
5060}
5061
5062/*
5063 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5064 * no snapshot by that name is found, or if an error occurs.
5065 */
5066static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5067{
5068 if (rbd_dev->image_format == 1)
5069 return rbd_v1_snap_id_by_name(rbd_dev, name);
5070
5071 return rbd_v2_snap_id_by_name(rbd_dev, name);
5072}
5073
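/*
 * Editor's note: the two lookups above differ because format 1 keeps
 * all snapshot names inline in the header (walked with strcmp), while
 * format 2 stores only ids in the snap context, so each name must be
 * fetched from the OSD via rbd_dev_v2_snap_name() before comparing.
 */
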
Alex Elder9e15b772012-10-30 19:40:33 -05005074/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005075 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005076 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005077static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5078{
5079 struct rbd_spec *spec = rbd_dev->spec;
5080
5081 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5082 rbd_assert(spec->image_id && spec->image_name);
5083 rbd_assert(spec->snap_name);
5084
5085 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5086 u64 snap_id;
5087
5088 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5089 if (snap_id == CEPH_NOSNAP)
5090 return -ENOENT;
5091
5092 spec->snap_id = snap_id;
5093 } else {
5094 spec->snap_id = CEPH_NOSNAP;
5095 }
5096
5097 return 0;
5098}
5099
5100/*
5101 * A parent image will have all ids but none of the names.
5102 *
5103 * All names in an rbd spec are dynamically allocated. It's OK if we
5104 * can't figure out the name for an image id.
5105 */
5106static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005107{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005108 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5109 struct rbd_spec *spec = rbd_dev->spec;
5110 const char *pool_name;
5111 const char *image_name;
5112 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005113 int ret;
5114
Ilya Dryomov04077592014-07-23 17:11:20 +04005115 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5116 rbd_assert(spec->image_id);
5117 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005118
Alex Elder2e9f7f12013-04-26 09:43:48 -05005119 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005120
Alex Elder2e9f7f12013-04-26 09:43:48 -05005121 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5122 if (!pool_name) {
5123 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005124 return -EIO;
5125 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005126 pool_name = kstrdup(pool_name, GFP_KERNEL);
5127 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005128 return -ENOMEM;
5129
5130 /* Fetch the image name; tolerate failure here */
5131
Alex Elder2e9f7f12013-04-26 09:43:48 -05005132 image_name = rbd_dev_image_name(rbd_dev);
5133 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005134 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005135
Ilya Dryomov04077592014-07-23 17:11:20 +04005136 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005137
Alex Elder2e9f7f12013-04-26 09:43:48 -05005138 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005139 if (IS_ERR(snap_name)) {
5140 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005141 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005142 }
5143
5144 spec->pool_name = pool_name;
5145 spec->image_name = image_name;
5146 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005147
5148 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005149
Alex Elder9e15b772012-10-30 19:40:33 -05005150out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005151 kfree(image_name);
5152 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005153 return ret;
5154}
5155
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005156static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005157{
5158 size_t size;
5159 int ret;
5160 void *reply_buf;
5161 void *p;
5162 void *end;
5163 u64 seq;
5164 u32 snap_count;
5165 struct ceph_snap_context *snapc;
5166 u32 i;
5167
5168 /*
5169 * We'll need room for the seq value (maximum snapshot id),
5170	 * snapshot count, and an array of that many snapshot ids.
5171 * For now we have a fixed upper limit on the number we're
5172 * prepared to receive.
5173 */
5174 size = sizeof (__le64) + sizeof (__le32) +
5175 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5176 reply_buf = kzalloc(size, GFP_KERNEL);
5177 if (!reply_buf)
5178 return -ENOMEM;
5179
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005180 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5181 &rbd_dev->header_oloc, "get_snapcontext",
5182 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005183 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005184 if (ret < 0)
5185 goto out;
5186
Alex Elder35d489f2012-07-03 16:01:19 -05005187 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005188 end = reply_buf + ret;
5189 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005190 ceph_decode_64_safe(&p, end, seq, out);
5191 ceph_decode_32_safe(&p, end, snap_count, out);
5192
5193 /*
5194 * Make sure the reported number of snapshot ids wouldn't go
5195 * beyond the end of our buffer. But before checking that,
5196 * make sure the computed size of the snapshot context we
5197 * allocate is representable in a size_t.
5198 */
5199 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5200 / sizeof (u64)) {
5201 ret = -EINVAL;
5202 goto out;
5203 }
5204 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5205 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005206 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005207
Alex Elder812164f82013-04-30 00:44:32 -05005208 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005209 if (!snapc) {
5210 ret = -ENOMEM;
5211 goto out;
5212 }
Alex Elder35d489f2012-07-03 16:01:19 -05005213 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005214 for (i = 0; i < snap_count; i++)
5215 snapc->snaps[i] = ceph_decode_64(&p);
5216
Alex Elder49ece552013-05-06 08:37:00 -05005217 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005218 rbd_dev->header.snapc = snapc;
5219
5220 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005221 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005222out:
5223 kfree(reply_buf);
5224
Alex Elder57385b52013-04-21 12:14:45 -05005225 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005226}
5227
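/*
 * Illustrative layout of the get_snapcontext reply decoded above (all
 * integers little-endian on the wire):
 *
 *	__le64 seq;			maximum snapshot id
 *	__le32 snap_count;
 *	__le64 snaps[snap_count];	snapshot ids
 *
 * The fixed reply_buf, sized for RBD_MAX_SNAP_COUNT ids, is what
 * bounds snap_count here.
 */
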
Alex Elder54cac612013-04-30 00:44:33 -05005228static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5229 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005230{
5231 size_t size;
5232 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005233 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005234 int ret;
5235 void *p;
5236 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005237 char *snap_name;
5238
5239 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5240 reply_buf = kmalloc(size, GFP_KERNEL);
5241 if (!reply_buf)
5242 return ERR_PTR(-ENOMEM);
5243
Alex Elder54cac612013-04-30 00:44:33 -05005244 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005245 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5246 &rbd_dev->header_oloc, "get_snapshot_name",
5247 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005248 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005249 if (ret < 0) {
5250 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005251 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005252 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005253
5254 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005255 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005256 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005257 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005258 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005259
Alex Elderf40eb342013-04-25 15:09:42 -05005260 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005261 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005262out:
5263 kfree(reply_buf);
5264
Alex Elderf40eb342013-04-25 15:09:42 -05005265 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005266}
5267
Alex Elder2df3fac2013-05-06 09:51:30 -05005268static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005269{
Alex Elder2df3fac2013-05-06 09:51:30 -05005270 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005271 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005272
Josh Durgin1617e402013-06-12 14:43:10 -07005273 ret = rbd_dev_v2_image_size(rbd_dev);
5274 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005275 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005276
Alex Elder2df3fac2013-05-06 09:51:30 -05005277 if (first_time) {
5278 ret = rbd_dev_v2_header_onetime(rbd_dev);
5279 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005280 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005281 }
5282
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005283 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005284 if (ret && first_time) {
5285 kfree(rbd_dev->header.object_prefix);
5286 rbd_dev->header.object_prefix = NULL;
5287 }
Alex Elder117973f2012-08-31 17:29:55 -05005288
5289 return ret;
5290}
5291
Ilya Dryomova720ae02014-07-23 17:11:19 +04005292static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5293{
5294 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5295
5296 if (rbd_dev->image_format == 1)
5297 return rbd_dev_v1_header_info(rbd_dev);
5298
5299 return rbd_dev_v2_header_info(rbd_dev);
5300}
5301
Alex Elder1ddbe942012-01-29 13:57:44 -06005302/*
Alex Eldere28fff262012-02-02 08:13:30 -06005303 * Skips over white space at *buf, and updates *buf to point to the
5304 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005305 * the token (string of non-white space characters) found. Note
5306 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005307 */
5308static inline size_t next_token(const char **buf)
5309{
5310 /*
5311 * These are the characters that produce nonzero for
5312 * isspace() in the "C" and "POSIX" locales.
5313 */
5314 const char *spaces = " \f\n\r\t\v";
5315
5316 *buf += strspn(*buf, spaces); /* Find start of token */
5317
5318 return strcspn(*buf, spaces); /* Return token length */
5319}
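/*
 * For example (a sketch, not driver code): if *buf points at the
 * string "  rbd myimage", next_token() advances *buf past the two
 * leading spaces to "rbd myimage" and returns 3, the length of "rbd".
 */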
5320
5321/*
Alex Elderea3352f2012-07-09 21:04:23 -05005322 * Finds the next token in *buf, dynamically allocates a buffer big
5323 * enough to hold a copy of it, and copies the token into the new
5324 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5325 * that a duplicate buffer is created even for a zero-length token.
5326 *
5327 * Returns a pointer to the newly-allocated duplicate, or a null
5328 * pointer if memory for the duplicate was not available. If
5329 * the lenp argument is a non-null pointer, the length of the token
5330 * (not including the '\0') is returned in *lenp.
5331 *
5332 * If successful, the *buf pointer will be updated to point beyond
5333 * the end of the found token.
5334 *
5335 * Note: uses GFP_KERNEL for allocation.
5336 */
5337static inline char *dup_token(const char **buf, size_t *lenp)
5338{
5339 char *dup;
5340 size_t len;
5341
5342 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005343 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005344 if (!dup)
5345 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005346 *(dup + len) = '\0';
5347 *buf += len;
5348
5349 if (lenp)
5350 *lenp = len;
5351
5352 return dup;
5353}
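/*
 * A minimal usage sketch (hypothetical caller, not driver code):
 *
 *	const char *buf = "mypool myimage";
 *	char *pool = dup_token(&buf, NULL);	// "mypool", or NULL on ENOMEM
 *	char *image = dup_token(&buf, NULL);	// "myimage", buf now at end
 *
 * Both results must eventually be released with kfree().
 */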
5354
5355/*
Alex Elder859c31d2012-10-25 23:34:42 -05005356 * Parse the options provided for an "rbd add" (i.e., rbd image
5357 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5358 * and the data written is passed here via a NUL-terminated buffer.
5359 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005360 *
Alex Elder859c31d2012-10-25 23:34:42 -05005361 * The information extracted from these options is returned through
 5362 * the remaining parameters, each of which refers to a
 5363 * dynamically-allocated structure:
5364 * ceph_opts
5365 * The address of a pointer that will refer to a ceph options
5366 * structure. Caller must release the returned pointer using
5367 * ceph_destroy_options() when it is no longer needed.
5368 * rbd_opts
5369 * Address of an rbd options pointer. Fully initialized by
5370 * this function; caller must release with kfree().
5371 * spec
5372 * Address of an rbd image specification pointer. Fully
5373 * initialized by this function based on parsed options.
5374 * Caller must release with rbd_spec_put().
5375 *
5376 * The options passed take this form:
 5377 *  <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5378 * where:
5379 * <mon_addrs>
5380 * A comma-separated list of one or more monitor addresses.
5381 * A monitor address is an ip address, optionally followed
5382 * by a port number (separated by a colon).
5383 * I.e.: ip1[:port1][,ip2[:port2]...]
5384 * <options>
5385 * A comma-separated list of ceph and/or rbd options.
5386 * <pool_name>
5387 * The name of the rados pool containing the rbd image.
5388 * <image_name>
5389 * The name of the image in that pool to map.
 5390 *  <snap_name>
 5391 *	An optional snapshot name.  If provided, the mapping will
 5392 *	present data from the image at the time that snapshot was
 5393 *	created.  The image head is used if no snapshot name is
 5394 *	provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005395 */
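/*
 * For example, a mapping request written to /sys/bus/rbd/add could
 * look like this (monitor address, key and names below are made up):
 *
 *	1.2.3.4:6789 name=admin,secret=AQDvRYpc... rbd myimage mysnap
 *
 * which maps snapshot "mysnap" of image "myimage" in pool "rbd".
 */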
Alex Elder859c31d2012-10-25 23:34:42 -05005396static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005397 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005398 struct rbd_options **opts,
5399 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005400{
Alex Elderd22f76e2012-07-12 10:46:35 -05005401 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005402 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005403 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005404 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005405 size_t mon_addrs_size;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005406 struct parse_rbd_opts_ctx pctx = { 0 };
Alex Elder859c31d2012-10-25 23:34:42 -05005407 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005408 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005409
5410 /* The first four tokens are required */
5411
Alex Elder7ef32142012-02-02 08:13:30 -06005412 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005413 if (!len) {
5414 rbd_warn(NULL, "no monitor address(es) provided");
5415 return -EINVAL;
5416 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005417 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005418 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005419 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005420
Alex Elderdc79b112012-10-25 23:34:41 -05005421 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005422 options = dup_token(&buf, NULL);
5423 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005424 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005425 if (!*options) {
5426 rbd_warn(NULL, "no options provided");
5427 goto out_err;
5428 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005429
Ilya Dryomovc3001562018-07-03 15:28:43 +02005430 pctx.spec = rbd_spec_alloc();
5431 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005432 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005433
Ilya Dryomovc3001562018-07-03 15:28:43 +02005434 pctx.spec->pool_name = dup_token(&buf, NULL);
5435 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05005436 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005437 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005438 rbd_warn(NULL, "no pool name provided");
5439 goto out_err;
5440 }
Alex Eldere28fff262012-02-02 08:13:30 -06005441
Ilya Dryomovc3001562018-07-03 15:28:43 +02005442 pctx.spec->image_name = dup_token(&buf, NULL);
5443 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005444 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005445 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005446 rbd_warn(NULL, "no image name provided");
5447 goto out_err;
5448 }
Alex Eldere28fff262012-02-02 08:13:30 -06005449
Alex Elderf28e5652012-10-25 23:34:41 -05005450 /*
5451 * Snapshot name is optional; default is to use "-"
5452 * (indicating the head/no snapshot).
5453 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005454 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005455 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005456 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5457 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005458 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005459 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005460 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005461 }
Alex Elderecb4dc22013-04-26 09:43:47 -05005462 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5463 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005464 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05005465 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02005466 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005467
Alex Elder0ddebc02012-10-25 23:34:41 -05005468 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005469
Ilya Dryomovc3001562018-07-03 15:28:43 +02005470 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5471 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05005472 goto out_mem;
5473
Ilya Dryomovc3001562018-07-03 15:28:43 +02005474 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5475 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01005476 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005477 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5478 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5479 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5480 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005481
Alex Elder859c31d2012-10-25 23:34:42 -05005482 copts = ceph_parse_options(options, mon_addrs,
Ilya Dryomovc3001562018-07-03 15:28:43 +02005483 mon_addrs + mon_addrs_size - 1,
5484 parse_rbd_opts_token, &pctx);
Alex Elder859c31d2012-10-25 23:34:42 -05005485 if (IS_ERR(copts)) {
5486 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005487 goto out_err;
5488 }
Alex Elder859c31d2012-10-25 23:34:42 -05005489 kfree(options);
5490
5491 *ceph_opts = copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005492 *opts = pctx.opts;
5493 *rbd_spec = pctx.spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005494
Alex Elderdc79b112012-10-25 23:34:41 -05005495 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005496out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005497 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005498out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02005499 kfree(pctx.opts);
5500 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005501 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005502
Alex Elderdc79b112012-10-25 23:34:41 -05005503 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005504}
5505
Ilya Dryomove010dd02017-04-13 12:17:39 +02005506static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5507{
5508 down_write(&rbd_dev->lock_rwsem);
5509 if (__rbd_is_lock_owner(rbd_dev))
5510 rbd_unlock(rbd_dev);
5511 up_write(&rbd_dev->lock_rwsem);
5512}
5513
5514static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5515{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005516 int ret;
5517
Ilya Dryomove010dd02017-04-13 12:17:39 +02005518 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5519 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5520 return -EINVAL;
5521 }
5522
 5523	/* FIXME: "rbd map --exclusive" should be interruptible */
5524 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005525 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005526 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005527 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005528 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5529 return -EROFS;
5530 }
5531
5532 return 0;
5533}
5534
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005535/*
Alex Elder589d30e2012-07-10 20:30:11 -05005536 * An rbd format 2 image has a unique identifier, distinct from the
5537 * name given to it by the user. Internally, that identifier is
5538 * what's used to specify the names of objects related to the image.
5539 *
5540 * A special "rbd id" object is used to map an rbd image name to its
5541 * id. If that object doesn't exist, then there is no v2 rbd image
5542 * with the supplied name.
5543 *
 5544 * This function will fill in the given rbd_dev's image_id field if
 5545 * it can be determined, and in that case will return 0.  If any
5546 * errors occur a negative errno will be returned and the rbd_dev's
5547 * image_id field will be unchanged (and should be NULL).
5548 */
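/*
 * For example (made-up name): for an image named "myimage", the id
 * object is expected to be named "rbd_id.myimage" (RBD_ID_PREFIX plus
 * the image name), and its "get_id" method returns the image id.
 */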
5549static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5550{
5551 int ret;
5552 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005553 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005554 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005555 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005556
Alex Elder589d30e2012-07-10 20:30:11 -05005557 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005558 * When probing a parent image, the image id is already
5559 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005560 * need to fetch the image id again in this case. We
5561 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005562 */
Alex Elderc0fba362013-04-25 23:15:08 -05005563 if (rbd_dev->spec->image_id) {
5564 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5565
Alex Elder2c0d0a12012-10-30 19:40:33 -05005566 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005567 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005568
5569 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005570 * First, see if the format 2 image id file exists, and if
5571 * so, get the image's persistent id from it.
5572 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005573 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5574 rbd_dev->spec->image_name);
5575 if (ret)
5576 return ret;
5577
5578 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005579
5580 /* Response will be an encoded string, which includes a length */
5581
5582 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5583 response = kzalloc(size, GFP_NOIO);
5584 if (!response) {
5585 ret = -ENOMEM;
5586 goto out;
5587 }
5588
Alex Elderc0fba362013-04-25 23:15:08 -05005589 /* If it doesn't exist we'll assume it's a format 1 image */
5590
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005591 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5592 "get_id", NULL, 0,
5593 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005594 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005595 if (ret == -ENOENT) {
5596 image_id = kstrdup("", GFP_KERNEL);
5597 ret = image_id ? 0 : -ENOMEM;
5598 if (!ret)
5599 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005600 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005601 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005602
Alex Elderc0fba362013-04-25 23:15:08 -05005603 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005604 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005605 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005606 if (!ret)
5607 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005608 }
5609
5610 if (!ret) {
5611 rbd_dev->spec->image_id = image_id;
5612 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005613 }
5614out:
5615 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005616 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005617 return ret;
5618}
5619
Alex Elder3abef3b2013-05-13 20:35:37 -05005620/*
5621 * Undo whatever state changes are made by v1 or v2 header info
5622 * call.
5623 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005624static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5625{
5626 struct rbd_image_header *header;
5627
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005628 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005629
5630 /* Free dynamic fields from the header, then zero it out */
5631
5632 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005633 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005634 kfree(header->snap_sizes);
5635 kfree(header->snap_names);
5636 kfree(header->object_prefix);
5637 memset(header, 0, sizeof (*header));
5638}
5639
Alex Elder2df3fac2013-05-06 09:51:30 -05005640static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005641{
5642 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005643
Alex Elder1e130192012-07-03 16:01:19 -05005644 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005645 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005646 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005647
Alex Elder2df3fac2013-05-06 09:51:30 -05005648 /*
 5649	 * Get and check the features for the image.  Currently the
5650 * features are assumed to never change.
5651 */
Alex Elderb1b54022012-07-03 16:01:19 -05005652 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005653 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005654 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005655
Alex Eldercc070d52013-04-21 12:14:45 -05005656 /* If the image supports fancy striping, get its parameters */
5657
5658 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5659 ret = rbd_dev_v2_striping_info(rbd_dev);
5660 if (ret < 0)
5661 goto out_err;
5662 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005663
Ilya Dryomov7e973322017-01-25 18:16:22 +01005664 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5665 ret = rbd_dev_v2_data_pool(rbd_dev);
5666 if (ret)
5667 goto out_err;
5668 }
5669
Ilya Dryomov263423f2017-01-25 18:16:22 +01005670 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005671 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005672
Alex Elder9d475de2012-07-03 16:01:19 -05005673out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005674 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005675 kfree(rbd_dev->header.object_prefix);
5676 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005677 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005678}
5679
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005680/*
5681 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5682 * rbd_dev_image_probe() recursion depth, which means it's also the
5683 * length of the already discovered part of the parent chain.
5684 */
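/*
 * For example, when mapping a clone of a clone, the mapped image is
 * probed with depth 0, its parent with depth 1 and its grandparent
 * with depth 2, subject to the RBD_MAX_PARENT_CHAIN_LEN check below.
 */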
5685static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005686{
Alex Elder2f82ee52012-10-30 19:40:33 -05005687 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005688 int ret;
5689
5690 if (!rbd_dev->parent_spec)
5691 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005692
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005693 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5694 pr_info("parent chain is too long (%d)\n", depth);
5695 ret = -EINVAL;
5696 goto out_err;
5697 }
5698
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005699 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005700 if (!parent) {
5701 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005702 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005703 }
5704
5705 /*
5706 * Images related by parent/child relationships always share
5707 * rbd_client and spec/parent_spec, so bump their refcounts.
5708 */
5709 __rbd_get_client(rbd_dev->rbd_client);
5710 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005711
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005712 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005713 if (ret < 0)
5714 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005715
Alex Elder124afba2013-04-26 15:44:36 -05005716 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005717 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005718 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005719
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005720out_err:
5721 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005722 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005723 return ret;
5724}
5725
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005726static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5727{
5728 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5729 rbd_dev_mapping_clear(rbd_dev);
5730 rbd_free_disk(rbd_dev);
5731 if (!single_major)
5732 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5733}
5734
Ilya Dryomov811c6682016-04-15 16:22:16 +02005735/*
5736 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5737 * upon return.
5738 */
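/*
 * That is, callers follow the convention used by do_rbd_add() below
 * (a condensed sketch of the calling pattern):
 *
 *	down_write(&rbd_dev->header_rwsem);
 *	ret = rbd_dev_device_setup(rbd_dev);
 *
 * with the semaphore released on both the success and error paths.
 */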
Alex Elder200a6a82013-04-28 23:32:34 -05005739static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005740{
Alex Elder83a06262012-10-30 15:47:17 -05005741 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005742
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005743 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005744
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005745 if (!single_major) {
5746 ret = register_blkdev(0, rbd_dev->name);
5747 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005748 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005749
5750 rbd_dev->major = ret;
5751 rbd_dev->minor = 0;
5752 } else {
5753 rbd_dev->major = rbd_major;
5754 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5755 }
Alex Elder83a06262012-10-30 15:47:17 -05005756
5757 /* Set up the blkdev mapping. */
5758
5759 ret = rbd_init_disk(rbd_dev);
5760 if (ret)
5761 goto err_out_blkdev;
5762
Alex Elderf35a4de2013-05-06 09:51:29 -05005763 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005764 if (ret)
5765 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005766
Alex Elderf35a4de2013-05-06 09:51:29 -05005767 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005768 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005769
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005770 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005771 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005772 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005773
Alex Elder129b79d2013-04-26 15:44:36 -05005774 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005775 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005776 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005777
Alex Elderf35a4de2013-05-06 09:51:29 -05005778err_out_mapping:
5779 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005780err_out_disk:
5781 rbd_free_disk(rbd_dev);
5782err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005783 if (!single_major)
5784 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005785err_out_unlock:
5786 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005787 return ret;
5788}
5789
Alex Elder332bb122013-04-27 09:59:30 -05005790static int rbd_dev_header_name(struct rbd_device *rbd_dev)
5791{
5792 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005793 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05005794
5795 /* Record the header object name for this rbd image. */
5796
5797 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05005798 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005799 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5800 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05005801 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005802 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
5803 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05005804
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005805 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05005806}
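/*
 * E.g. (made-up names): a format 1 image "foo" is expected to use
 * header object "foo.rbd" (RBD_SUFFIX), while a format 2 image with
 * id "abc123" uses "rbd_header.abc123" (RBD_HEADER_PREFIX).
 */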
5807
Alex Elder200a6a82013-04-28 23:32:34 -05005808static void rbd_dev_image_release(struct rbd_device *rbd_dev)
5809{
Alex Elder6fd48b32013-04-28 23:32:34 -05005810 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005811 if (rbd_dev->opts)
5812 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005813 rbd_dev->image_format = 0;
5814 kfree(rbd_dev->spec->image_id);
5815 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05005816}
5817
Alex Eldera30b71b2012-07-10 20:30:11 -05005818/*
5819 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05005820 * device. If this image is the one being mapped (i.e., not a
5821 * parent), initiate a watch on its header object before using that
5822 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05005823 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005824static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05005825{
5826 int ret;
5827
5828 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05005829 * Get the id from the image id object. Unless there's an
5830 * error, rbd_dev->spec->image_id will be filled in with
5831 * a dynamically-allocated string, and rbd_dev->image_format
5832 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05005833 */
5834 ret = rbd_dev_image_id(rbd_dev);
5835 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05005836 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05005837
Alex Elder332bb122013-04-27 09:59:30 -05005838 ret = rbd_dev_header_name(rbd_dev);
5839 if (ret)
5840 goto err_out_format;
5841
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005842 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02005843 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005844 if (ret) {
5845 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005846 pr_info("image %s/%s%s%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005847 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005848 rbd_dev->spec->pool_ns ?: "",
5849 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005850 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005851 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005852 }
Alex Elder1f3ef782013-05-06 17:40:33 -05005853 }
Alex Elderb644de22013-04-27 09:59:31 -05005854
Ilya Dryomova720ae02014-07-23 17:11:19 +04005855 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05005856 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05005857 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05005858
Ilya Dryomov04077592014-07-23 17:11:20 +04005859 /*
5860 * If this image is the one being mapped, we have pool name and
5861 * id, image name and id, and snap name - need to fill snap id.
5862 * Otherwise this is a parent image, identified by pool, image
5863 * and snap ids - need to fill in names for those ids.
5864 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005865 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04005866 ret = rbd_spec_fill_snap_id(rbd_dev);
5867 else
5868 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005869 if (ret) {
5870 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005871 pr_info("snap %s/%s%s%s@%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005872 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02005873 rbd_dev->spec->pool_ns ?: "",
5874 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005875 rbd_dev->spec->image_name,
5876 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05005877 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005878 }
Alex Elder9bb81c92013-04-27 09:59:30 -05005879
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005880 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
5881 ret = rbd_dev_v2_parent_info(rbd_dev);
5882 if (ret)
5883 goto err_out_probe;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005884 }
5885
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005886 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05005887 if (ret)
5888 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05005889
Alex Elder30d60ba2013-05-06 09:51:30 -05005890 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02005891 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05005892 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04005893
Alex Elder6fd48b32013-04-28 23:32:34 -05005894err_out_probe:
5895 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05005896err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005897 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02005898 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05005899err_out_format:
5900 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05005901 kfree(rbd_dev->spec->image_id);
5902 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05005903 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005904}
5905
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005906static ssize_t do_rbd_add(struct bus_type *bus,
5907 const char *buf,
5908 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005909{
Alex Eldercb8627c2012-07-09 21:04:23 -05005910 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05005911 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005912 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005913 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05005914 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005915 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005916
5917 if (!try_module_get(THIS_MODULE))
5918 return -ENODEV;
5919
Alex Eldera725f65e2012-02-02 08:13:30 -06005920 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05005921 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05005922 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005923 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06005924
Alex Elder9d3997f2012-10-25 23:34:42 -05005925 rbdc = rbd_get_client(ceph_opts);
5926 if (IS_ERR(rbdc)) {
5927 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05005928 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05005929 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005930
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005931 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01005932 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005933 if (rc < 0) {
5934 if (rc == -ENOENT)
5935 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005936 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03005937 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05005938 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05005939
Ilya Dryomovd1475432015-06-22 13:24:48 +03005940 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005941 if (!rbd_dev) {
5942 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05005943 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02005944 }
Alex Elderc53d5892012-10-25 23:34:42 -05005945 rbdc = NULL; /* rbd_dev now owns this */
5946 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03005947 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07005948
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005949 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
5950 if (!rbd_dev->config_info) {
5951 rc = -ENOMEM;
5952 goto err_out_rbd_dev;
5953 }
5954
Ilya Dryomov811c6682016-04-15 16:22:16 +02005955 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005956 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005957 if (rc < 0) {
5958 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05005959 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02005960 }
Alex Elder05fd6f62012-08-29 17:11:07 -05005961
Alex Elder7ce4eef2013-05-06 17:40:33 -05005962 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05005963 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02005964 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05005965
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01005966 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
5967 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
5968 rbd_dev->layout.object_size);
5969 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
5970 }
5971
Alex Elderb536f692013-04-28 23:32:34 -05005972 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02005973 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02005974 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05005975
Ilya Dryomove010dd02017-04-13 12:17:39 +02005976 if (rbd_dev->opts->exclusive) {
5977 rc = rbd_add_acquire_lock(rbd_dev);
5978 if (rc)
5979 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05005980 }
5981
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005982 /* Everything's ready. Announce the disk to the world. */
5983
5984 rc = device_add(&rbd_dev->dev);
5985 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02005986 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005987
5988 add_disk(rbd_dev->disk);
5989 /* see rbd_init_disk() */
5990 blk_put_queue(rbd_dev->disk->queue);
5991
5992 spin_lock(&rbd_dev_list_lock);
5993 list_add_tail(&rbd_dev->node, &rbd_dev_list);
5994 spin_unlock(&rbd_dev_list_lock);
5995
5996 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
5997 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
5998 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02005999 rc = count;
6000out:
6001 module_put(THIS_MODULE);
6002 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006003
Ilya Dryomove010dd02017-04-13 12:17:39 +02006004err_out_image_lock:
6005 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006006err_out_device_setup:
6007 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006008err_out_image_probe:
6009 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006010err_out_rbd_dev:
6011 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006012err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006013 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006014err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006015 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006016 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006017 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006018}
6019
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006020static ssize_t rbd_add(struct bus_type *bus,
6021 const char *buf,
6022 size_t count)
6023{
6024 if (single_major)
6025 return -EINVAL;
6026
6027 return do_rbd_add(bus, buf, count);
6028}
6029
6030static ssize_t rbd_add_single_major(struct bus_type *bus,
6031 const char *buf,
6032 size_t count)
6033{
6034 return do_rbd_add(bus, buf, count);
6035}
6036
Alex Elder05a46af2013-04-26 15:44:36 -05006037static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6038{
Alex Elderad945fc2013-04-26 15:44:36 -05006039 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006040 struct rbd_device *first = rbd_dev;
6041 struct rbd_device *second = first->parent;
6042 struct rbd_device *third;
6043
6044 /*
 6045		 * Walk down to the parent that has no grandparent
 6046		 * and remove it.
6047 */
6048 while (second && (third = second->parent)) {
6049 first = second;
6050 second = third;
6051 }
Alex Elderad945fc2013-04-26 15:44:36 -05006052 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006053 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006054 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006055 first->parent = NULL;
6056 first->parent_overlap = 0;
6057
6058 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006059 rbd_spec_put(first->parent_spec);
6060 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006061 }
6062}
6063
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006064static ssize_t do_rbd_remove(struct bus_type *bus,
6065 const char *buf,
6066 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006067{
6068 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006069 struct list_head *tmp;
6070 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006071 char opt_buf[6];
Mike Christie0276dca2016-08-18 18:38:45 +02006072 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006073 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006074
Mike Christie0276dca2016-08-18 18:38:45 +02006075 dev_id = -1;
6076 opt_buf[0] = '\0';
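	/*
	 * The buffer holds "<dev-id>", optionally followed by " force",
	 * e.g. (made-up id) writing "2 force" to /sys/bus/rbd/remove.
	 */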
6077 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6078 if (dev_id < 0) {
6079 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006080 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006081 }
6082 if (opt_buf[0] != '\0') {
6083 if (!strcmp(opt_buf, "force")) {
6084 force = true;
6085 } else {
6086 pr_err("bad remove option at '%s'\n", opt_buf);
6087 return -EINVAL;
6088 }
6089 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006090
Alex Elder751cc0e2013-05-31 15:17:01 -05006091 ret = -ENOENT;
6092 spin_lock(&rbd_dev_list_lock);
6093 list_for_each(tmp, &rbd_dev_list) {
6094 rbd_dev = list_entry(tmp, struct rbd_device, node);
6095 if (rbd_dev->dev_id == dev_id) {
6096 ret = 0;
6097 break;
6098 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006099 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006100 if (!ret) {
6101 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006102 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006103 ret = -EBUSY;
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006104 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6105 &rbd_dev->flags))
6106 ret = -EINPROGRESS;
Alex Elder751cc0e2013-05-31 15:17:01 -05006107 spin_unlock_irq(&rbd_dev->lock);
6108 }
6109 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006110 if (ret)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006111 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006112
Mike Christie0276dca2016-08-18 18:38:45 +02006113 if (force) {
6114 /*
6115 * Prevent new IO from being queued and wait for existing
6116 * IO to complete/fail.
6117 */
6118 blk_mq_freeze_queue(rbd_dev->disk->queue);
6119 blk_set_queue_dying(rbd_dev->disk->queue);
6120 }
6121
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006122 del_gendisk(rbd_dev->disk);
6123 spin_lock(&rbd_dev_list_lock);
6124 list_del_init(&rbd_dev->node);
6125 spin_unlock(&rbd_dev_list_lock);
6126 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006127
Ilya Dryomove010dd02017-04-13 12:17:39 +02006128 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006129 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006130 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006131 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006132 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006133}
6134
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006135static ssize_t rbd_remove(struct bus_type *bus,
6136 const char *buf,
6137 size_t count)
6138{
6139 if (single_major)
6140 return -EINVAL;
6141
6142 return do_rbd_remove(bus, buf, count);
6143}
6144
6145static ssize_t rbd_remove_single_major(struct bus_type *bus,
6146 const char *buf,
6147 size_t count)
6148{
6149 return do_rbd_remove(bus, buf, count);
6150}
6151
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006152/*
6153 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006154 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006155 */
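/*
 * These include /sys/bus/rbd/add and /sys/bus/rbd/remove, plus the
 * add_single_major/remove_single_major variants used when the
 * single_major module parameter is set.
 */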
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006156static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006157{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006158 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006159
Alex Elderfed4c142012-02-07 12:03:36 -06006160 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006161 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006162 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006163
Alex Elderfed4c142012-02-07 12:03:36 -06006164 ret = bus_register(&rbd_bus_type);
6165 if (ret < 0)
6166 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006167
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006168 return ret;
6169}
6170
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006171static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006172{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006173 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006174 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006175}
6176
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006177static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006178{
6179 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006180 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006181 if (!rbd_img_request_cache)
6182 return -ENOMEM;
6183
6184 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006185 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006186 if (!rbd_obj_request_cache)
6187 goto out_err;
6188
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006189 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006190
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006191out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006192 kmem_cache_destroy(rbd_img_request_cache);
6193 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006194 return -ENOMEM;
6195}
6196
6197static void rbd_slab_exit(void)
6198{
Alex Elder868311b2013-05-01 12:43:03 -05006199 rbd_assert(rbd_obj_request_cache);
6200 kmem_cache_destroy(rbd_obj_request_cache);
6201 rbd_obj_request_cache = NULL;
6202
Alex Elder1c2a9df2013-05-01 12:43:03 -05006203 rbd_assert(rbd_img_request_cache);
6204 kmem_cache_destroy(rbd_img_request_cache);
6205 rbd_img_request_cache = NULL;
6206}
6207
Alex Eldercc344fa2013-02-19 12:25:56 -06006208static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006209{
6210 int rc;
6211
Alex Elder1e32d342013-01-30 11:13:33 -06006212 if (!libceph_compatible(NULL)) {
6213 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006214 return -EINVAL;
6215 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006216
Alex Elder1c2a9df2013-05-01 12:43:03 -05006217 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006218 if (rc)
6219 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006220
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006221 /*
6222 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006223 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006224 */
6225 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6226 if (!rbd_wq) {
6227 rc = -ENOMEM;
6228 goto err_out_slab;
6229 }
6230
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006231 if (single_major) {
6232 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6233 if (rbd_major < 0) {
6234 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006235 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006236 }
6237 }
6238
Alex Elder1c2a9df2013-05-01 12:43:03 -05006239 rc = rbd_sysfs_init();
6240 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006241 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006242
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006243 if (single_major)
6244 pr_info("loaded (major %d)\n", rbd_major);
6245 else
6246 pr_info("loaded\n");
6247
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006248 return 0;
6249
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006250err_out_blkdev:
6251 if (single_major)
6252 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006253err_out_wq:
6254 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006255err_out_slab:
6256 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006257 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006258}
6259
Alex Eldercc344fa2013-02-19 12:25:56 -06006260static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006261{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006262 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006263 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006264 if (single_major)
6265 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006266 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006267 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006268}
6269
6270module_init(rbd_init);
6271module_exit(rbd_exit);
6272
Alex Elderd552c612013-05-31 20:13:09 -05006273MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006274MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6275MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006276/* following authorship retained from original osdblk.c */
6277MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6278
Ilya Dryomov90da2582013-12-13 15:28:56 +02006279MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006280MODULE_LICENSE("GPL");