
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * The basic unit of block I/O is a sector.  It is interpreted in a
 * number of contexts in Linux (blk, bio, genhd), but the default is
 * universally 512 bytes.  These symbols are just slightly more
 * meaningful than the bare numbers they represent.
 */
#define	SECTOR_SHIFT	9
#define	SECTOR_SIZE	(1ULL << SECTOR_SHIFT)

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)__atomic_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
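
/*
 * Illustrative usage sketch (editorial addition, not part of the
 * driver): these helpers implement a saturating counter where 0 is a
 * terminal state that must not be resurrected by a racing increment.
 * The driver uses this pattern for counts such as parent_ref:
 *
 *	if (atomic_inc_return_safe(&rbd_dev->parent_ref) > 0) {
 *		// parent is pinned, safe to use
 *		...
 *		atomic_dec_return_safe(&rbd_dev->parent_ref);
 *	}
 *
 * A return of 0 or -EINVAL means the reference was not taken.
 */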

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN \
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
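
/*
 * Worked example (editorial addition): sizeof ("snap_") is 6 including
 * the terminating NUL, so a snapshot name may use up to NAME_MAX - 5
 * characters and still fit in a sysfs entry named "snap_<name>".
 */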

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)
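
/*
 * Note (editorial assumption): the gap between EXCLUSIVE_LOCK (bit 2)
 * and DATA_POOL (bit 7) corresponds to feature bits defined by
 * userspace librbd (e.g. object-map, fast-diff, deep-flatten,
 * journaling) that this kernel client does not implement.
 */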

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct rbd_img_request;
typedef void (*rbd_img_callback_t)(struct rbd_img_request *);

#define	BAD_WHICH	U32_MAX		/* Good which or bad which, which? */

struct rbd_obj_request;
typedef void (*rbd_obj_callback_t)(struct rbd_obj_request *);

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
};

enum obj_req_flags {
	OBJ_REQ_DONE,		/* completion flag: not done = 0, done = 1 */
	OBJ_REQ_IMG_DATA,	/* object usage: standalone = 0, image = 1 */
	OBJ_REQ_KNOWN,		/* EXISTS flag valid: no = 0, yes = 1 */
	OBJ_REQ_EXISTS,		/* target exists: no = 0, yes = 1 */
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *                       need copyup
 * RBD_OBJ_WRITE_GUARD ---------------> RBD_OBJ_WRITE_COPYUP
 *            |     ^                              |
 *            v     \------------------------------/
 *          done
 *            ^
 *            |
 * RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * there is a parent or not.
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_FLAT = 1,
	RBD_OBJ_WRITE_GUARD,
	RBD_OBJ_WRITE_COPYUP,
};
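
/*
 * Illustrative walk-through (editorial addition): for a write to an
 * object of a cloned (layered) image, the request starts in
 * RBD_OBJ_WRITE_GUARD, issuing the write guarded by an object
 * existence check.  If the target object does not exist yet, the
 * guard fails, the relevant range is read from the parent image, and
 * the request moves to RBD_OBJ_WRITE_COPYUP to write the parent data
 * plus the new data into the child object.  Images without a parent
 * skip all of this and complete via RBD_OBJ_WRITE_FLAT.
 */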

struct rbd_obj_request {
	u64			object_no;
	u64			offset;		/* object start byte */
	u64			length;		/* bytes from offset */
	unsigned long		flags;
	union {
		bool			tried_parent;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	/*
	 * An object request associated with an image will have its
	 * img_data flag set; a standalone object request will not.
	 *
	 * A standalone object request will have which == BAD_WHICH
	 * and a null obj_request pointer.
	 *
	 * An object request initiated in support of a layered image
	 * object (to check for its existence before a write) will
	 * have which == BAD_WHICH and a non-null obj_request pointer.
	 *
	 * Finally, an object request for rbd image data will have
	 * which != BAD_WHICH, and will have a non-null img_request
	 * pointer.  The value of which will be in the range
	 * 0..(img_request->obj_request_count-1).
	 */
	union {
		struct rbd_obj_request	*obj_request;	/* STAT op */
		struct {
			struct rbd_img_request	*img_request;
			u64			img_offset;
			/* links for img_request->obj_requests list */
			struct list_head	links;
		};
	};
	u32			which;		/* posn image request list */

	enum obj_request_type	type;
	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct ceph_osd_request	*osd_req;

	u64			xferred;	/* bytes transferred */
	int			result;

	rbd_obj_callback_t	callback;

	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_WRITE,		/* I/O direction: read = 0, write = 1 */
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
	IMG_REQ_DISCARD,	/* discard: normal = 0, discard request = 1 */
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	u64			offset;	/* starting image byte offset */
	u64			length;	/* byte count from offset */
	unsigned long		flags;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};
	spinlock_t		completion_lock;/* protects next_completion */
	u32			next_completion;
	rbd_img_callback_t	callback;
	u64			xferred;/* aggregate bytes transferred */
	int			result;	/* first nonzero obj_request result */

	u32			obj_request_count;
	struct list_head	obj_requests;	/* rbd_obj_request structs */

	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_from(ireq, oreq) \
	list_for_each_entry_from(oreq, &(ireq)->obj_requests, links)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe_reverse(oreq, n, &(ireq)->obj_requests, links)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};

/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, S_IRUGO);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static int rbd_img_request_submit(struct rbd_img_request *img_request);

static ssize_t rbd_add(struct bus_type *bus, const char *buf,
		       size_t count);
static ssize_t rbd_remove(struct bus_type *bus, const char *buf,
			  size_t count);
static ssize_t rbd_add_single_major(struct bus_type *bus, const char *buf,
				    size_t count);
static ssize_t rbd_remove_single_major(struct bus_type *bus, const char *buf,
				       size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
static void rbd_spec_put(struct rbd_spec *spec);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
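
/*
 * Worked example (editorial addition): with RBD_SINGLE_MAJOR_PART_SHIFT
 * set to 4, each device owns a block of 16 minors.  Device id 2 maps
 * to first minor 32, and minors 32..47 address /dev/rbd2 and its
 * partitions (minor 33 is rbd2p1); minor_to_rbd_dev_id(33) == 2.
 */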

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t rbd_supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR(add, S_IWUSR, NULL, rbd_add);
static BUS_ATTR(remove, S_IWUSR, NULL, rbd_remove);
static BUS_ATTR(add_single_major, S_IWUSR, NULL, rbd_add_single_major);
static BUS_ATTR(remove_single_major, S_IWUSR, NULL, rbd_remove_single_major);
static BUS_ATTR(supported_features, S_IRUGO, rbd_supported_features_show, NULL);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}
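
/*
 * Illustrative usage sketch (editorial addition; the message text is
 * hypothetical): rbd_warn() picks the most specific identifier that is
 * available for the device, so a caller can simply write
 *
 *	rbd_warn(rbd_dev, "failed to acquire lock: %d", ret);
 *
 * and get "rbd: rbd0: failed to acquire lock: -12", or
 * "rbd: image foo: ..." if the disk has not been set up yet.
 */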

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_osd_copyup_callback(struct rbd_obj_request *obj_request);
static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request);
static void rbd_img_parent_read(struct rbd_obj_request *obj_request);
static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
		u64 *snap_features);

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_last_int,
	/* int args above */
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	/* int args above */
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false

static int parse_rbd_opts_token(char *c, void *private)
{
	struct rbd_options *rbd_opts = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad mount option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		rbd_opts->queue_depth = intval;
		break;
	case Opt_read_only:
		rbd_opts->read_only = true;
		break;
	case Opt_read_write:
		rbd_opts->read_only = false;
		break;
	case Opt_lock_on_read:
		rbd_opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		rbd_opts->exclusive = true;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
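
/*
 * Illustrative example (editorial addition): parse_rbd_opts_token() is
 * invoked once per comma-separated token of the per-device option
 * string passed at map time, e.g. for
 *
 *	queue_depth=128,lock_on_read,exclusive
 *
 * it is called three times and leaves rbd_opts with queue_depth = 128
 * and lock_on_read = exclusive = true.
 */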

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	default:
		return "???";
	}
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;

	mutex_lock_nested(&client_mutex, SINGLE_DEPTH_NESTING);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc)	/* using an existing client */
		ceph_destroy_options(ceph_opts);
	else
		rbdc = rbd_client_create(ceph_opts);
	mutex_unlock(&client_mutex);

	return rbdc;
}

/*
 * Destroy ceph client
 *
 * Caller must hold rbd_client_list_lock.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
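
/*
 * Illustrative numbers (editorial assumption): for a default-order
 * image, obj_order is 22, so objects are 4 MiB.  Without fancy
 * striping the layout then becomes stripe_unit = object_size = 4 MiB
 * and stripe_count = 1, and data goes to the mapped pool unless a
 * separate data pool was configured at image creation.
 */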

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}

static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
{
	const char *snap_name;

	rbd_assert(which < rbd_dev->header.snapc->num_snaps);

	/* Skip over names until we find the one we are looking for */

	snap_name = rbd_dev->header.snap_names;
	while (which--)
		snap_name += strlen(snap_name) + 1;

	return kstrdup(snap_name, GFP_KERNEL);
}
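
/*
 * Worked example (editorial addition): with format 1 metadata the
 * names sit in one packed buffer, e.g. snap_names = "one\0two\0three\0".
 * For which == 2, the loop skips "one" and "two" (4 bytes each,
 * including the NUL) and returns a copy of "three".
 */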

/*
 * Snapshot id comparison function for use with qsort()/bsearch().
 * Note that result is for snapshots in *descending* order.
 */
static int snapid_compare_reverse(const void *s1, const void *s2)
{
	u64 snap_id1 = *(u64 *)s1;
	u64 snap_id2 = *(u64 *)s2;

	if (snap_id1 < snap_id2)
		return 1;
	return snap_id1 == snap_id2 ? 0 : -1;
}
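
/*
 * Illustrative example (editorial addition): for the descending array
 * {12, 7, 3}, comparing 7 against 12 yields 1, meaning 7 sorts after
 * 12 in this ordering, so bsearch() can treat the reverse-sorted
 * snaps array as if it were ascending.
 */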
1137
1138/*
1139 * Search a snapshot context to see if the given snapshot id is
1140 * present.
1141 *
1142 * Returns the position of the snapshot id in the array if it's found,
1143 * or BAD_SNAP_INDEX otherwise.
1144 *
1145 * Note: The snapshot array is in kept sorted (by the osd) in
1146 * reverse order, highest snapshot id first.
1147 */
Alex Elder9682fc62013-04-30 00:44:33 -05001148static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1149{
1150 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001151 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001152
Alex Elder30d1cff2013-05-01 12:43:03 -05001153 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1154 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001155
Alex Elder30d1cff2013-05-01 12:43:03 -05001156 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001157}
1158
Alex Elder2ad3d712013-04-30 00:44:33 -05001159static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1160 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001161{
1162 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001163 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001164
1165 which = rbd_dev_snap_index(rbd_dev, snap_id);
1166 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001167 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001168
Josh Durginda6a6b62013-09-04 17:57:31 -07001169 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1170 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001171}
1172
Alex Elder9e15b772012-10-30 19:40:33 -05001173static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1174{
Alex Elder9e15b772012-10-30 19:40:33 -05001175 if (snap_id == CEPH_NOSNAP)
1176 return RBD_SNAP_HEAD_NAME;
1177
Alex Elder54cac612013-04-30 00:44:33 -05001178 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1179 if (rbd_dev->image_format == 1)
1180 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001181
Alex Elder54cac612013-04-30 00:44:33 -05001182 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001183}
1184
Alex Elder2ad3d712013-04-30 00:44:33 -05001185static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1186 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001187{
Alex Elder2ad3d712013-04-30 00:44:33 -05001188 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1189 if (snap_id == CEPH_NOSNAP) {
1190 *snap_size = rbd_dev->header.image_size;
1191 } else if (rbd_dev->image_format == 1) {
1192 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001193
Alex Elder2ad3d712013-04-30 00:44:33 -05001194 which = rbd_dev_snap_index(rbd_dev, snap_id);
1195 if (which == BAD_SNAP_INDEX)
1196 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001197
Alex Elder2ad3d712013-04-30 00:44:33 -05001198 *snap_size = rbd_dev->header.snap_sizes[which];
1199 } else {
1200 u64 size = 0;
1201 int ret;
1202
1203 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1204 if (ret)
1205 return ret;
1206
1207 *snap_size = size;
1208 }
1209 return 0;
1210}
1211
1212static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1213 u64 *snap_features)
1214{
1215 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1216 if (snap_id == CEPH_NOSNAP) {
1217 *snap_features = rbd_dev->header.features;
1218 } else if (rbd_dev->image_format == 1) {
1219 *snap_features = 0; /* No features for format 1 */
1220 } else {
1221 u64 features = 0;
1222 int ret;
1223
1224 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1225 if (ret)
1226 return ret;
1227
1228 *snap_features = features;
1229 }
1230 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001231}
1232
Alex Elderd1cf5782013-04-27 09:59:30 -05001233static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001234{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001235 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001236 u64 size = 0;
1237 u64 features = 0;
1238 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001239
Alex Elder2ad3d712013-04-30 00:44:33 -05001240 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1241 if (ret)
1242 return ret;
1243 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1244 if (ret)
1245 return ret;
1246
1247 rbd_dev->mapping.size = size;
1248 rbd_dev->mapping.features = features;
1249
Alex Elder8b0241f2013-04-25 23:15:08 -05001250 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001251}
1252
Alex Elderd1cf5782013-04-27 09:59:30 -05001253static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1254{
1255 rbd_dev->mapping.size = 0;
1256 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001257}
1258
Alex Elder65ccfe22012-08-09 10:33:26 -07001259static u64 rbd_segment_offset(struct rbd_device *rbd_dev, u64 offset)
1260{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001261 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001262
Alex Elder65ccfe22012-08-09 10:33:26 -07001263 return offset & (segment_size - 1);
1264}
1265
1266static u64 rbd_segment_length(struct rbd_device *rbd_dev,
1267 u64 offset, u64 length)
1268{
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01001269 u64 segment_size = rbd_obj_bytes(&rbd_dev->header);
Alex Elder65ccfe22012-08-09 10:33:26 -07001270
1271 offset &= segment_size - 1;
1272
Alex Elderaafb2302012-09-06 16:00:54 -05001273 rbd_assert(length <= U64_MAX - offset);
Alex Elder65ccfe22012-08-09 10:33:26 -07001274 if (offset + length > segment_size)
1275 length = segment_size - offset;
1276
1277 return length;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001278}
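/*
 * Worked example (a sketch, assuming the default 4 MiB object size,
 * so segment_size == 1 << 22): an 8 MiB I/O starting 4 KiB into the
 * second object is clipped to the end of that object:
 *
 *	u64 img_offset = (4 << 20) + 0x1000;
 *	u64 off = rbd_segment_offset(rbd_dev, img_offset); // 0x1000
 *	u64 len = rbd_segment_length(rbd_dev, img_offset, 8 << 20);
 *	// len == (4 << 20) - 0x1000; callers loop over the remainder
 */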
1279
Ilya Dryomov5359a172018-01-20 10:30:10 +01001280static void zero_bvec(struct bio_vec *bv)
1281{
1282 void *buf;
1283 unsigned long flags;
1284
1285 buf = bvec_kmap_irq(bv, &flags);
1286 memset(buf, 0, bv->bv_len);
1287 flush_dcache_page(bv->bv_page);
1288 bvec_kunmap_irq(buf, &flags);
1289}
1290
1291static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
1292{
1293 struct ceph_bio_iter it = *bio_pos;
1294
1295 ceph_bio_iter_advance(&it, off);
1296 ceph_bio_iter_advance_step(&it, bytes, ({
1297 zero_bvec(&bv);
1298 }));
1299}
1300
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001301static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001302{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001303 struct ceph_bvec_iter it = *bvec_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001304
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001305 ceph_bvec_iter_advance(&it, off);
1306 ceph_bvec_iter_advance_step(&it, bytes, ({
1307 zero_bvec(&bv);
1308 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001309}
1310
1311/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001312 * Zero a range in @obj_req data buffer defined by a bio (list) or
1313 * bio_vec array.
1314 *
1315 * @off is relative to the start of the data buffer.
1316 */
1317static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1318 u32 bytes)
1319{
1320 switch (obj_req->type) {
1321 case OBJ_REQUEST_BIO:
1322 zero_bios(&obj_req->bio_pos, off, bytes);
1323 break;
1324 case OBJ_REQUEST_BVECS:
1325 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1326 break;
1327 default:
1328 rbd_assert(0);
1329 }
1330}
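/*
 * Usage sketch (illustrative, hypothetical values): zero the last
 * 512 bytes of a request's data buffer, e.g. when padding out a
 * short read. @off counts from the start of the buffer, not from
 * the start of the object:
 *
 *	rbd_obj_zero_range(obj_req, obj_req->length - 512, 512);
 */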
1331
1332/*
Alex Elder926f9b32013-02-11 12:33:24 -06001333 * The default/initial value for all object request flags is 0. For
1334 * each flag, once its value is set to 1 it is never reset to 0
1335 * again.
1336 */
Alex Elder6365d332013-02-11 12:33:24 -06001337static void obj_request_img_data_set(struct rbd_obj_request *obj_request)
1338{
1339 if (test_and_set_bit(OBJ_REQ_IMG_DATA, &obj_request->flags)) {
Alex Elder6365d332013-02-11 12:33:24 -06001340 struct rbd_device *rbd_dev;
1341
Alex Elder57acbaa2013-02-11 12:33:24 -06001342 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001343 rbd_warn(rbd_dev, "obj_request %p already marked img_data",
Alex Elder6365d332013-02-11 12:33:24 -06001344 obj_request);
1345 }
1346}
1347
1348static bool obj_request_img_data_test(struct rbd_obj_request *obj_request)
1349{
1350 smp_mb();
1351 return test_bit(OBJ_REQ_IMG_DATA, &obj_request->flags) != 0;
1352}
1353
Alex Elder57acbaa2013-02-11 12:33:24 -06001354static void obj_request_done_set(struct rbd_obj_request *obj_request)
1355{
1356 if (test_and_set_bit(OBJ_REQ_DONE, &obj_request->flags)) {
1357 struct rbd_device *rbd_dev = NULL;
1358
1359 if (obj_request_img_data_test(obj_request))
1360 rbd_dev = obj_request->img_request->rbd_dev;
Ilya Dryomov9584d502014-07-11 12:11:20 +04001361 rbd_warn(rbd_dev, "obj_request %p already marked done",
Alex Elder57acbaa2013-02-11 12:33:24 -06001362 obj_request);
1363 }
1364}
1365
1366static bool obj_request_done_test(struct rbd_obj_request *obj_request)
1367{
1368 smp_mb();
1369 return test_bit(OBJ_REQ_DONE, &obj_request->flags) != 0;
1370}
1371
Alex Elder5679c592013-02-11 12:33:24 -06001372/*
1373 * This sets the KNOWN flag after (possibly) setting the EXISTS
1374 * flag. The latter is set based on the "exists" value provided.
1375 *
1376 * Note that for our purposes once an object exists it never goes
 1377 * away again. It's possible that the responses from two existence
1378 * checks are separated by the creation of the target object, and
1379 * the first ("doesn't exist") response arrives *after* the second
1380 * ("does exist"). In that case we ignore the second one.
1381 */
1382static void obj_request_existence_set(struct rbd_obj_request *obj_request,
1383 bool exists)
1384{
1385 if (exists)
1386 set_bit(OBJ_REQ_EXISTS, &obj_request->flags);
1387 set_bit(OBJ_REQ_KNOWN, &obj_request->flags);
1388 smp_mb();
1389}
1390
1391static bool obj_request_known_test(struct rbd_obj_request *obj_request)
1392{
1393 smp_mb();
1394 return test_bit(OBJ_REQ_KNOWN, &obj_request->flags) != 0;
1395}
1396
1397static bool obj_request_exists_test(struct rbd_obj_request *obj_request)
1398{
1399 smp_mb();
1400 return test_bit(OBJ_REQ_EXISTS, &obj_request->flags) != 0;
1401}
1402
Ilya Dryomov96385562014-06-10 13:53:29 +04001403static bool obj_request_overlaps_parent(struct rbd_obj_request *obj_request)
1404{
1405 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
1406
1407 return obj_request->img_offset <
1408 round_up(rbd_dev->parent_overlap, rbd_obj_bytes(&rbd_dev->header));
1409}
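/*
 * Worked example (sketch): the overlap is rounded up to a whole
 * object, so an object request either fully overlaps the parent or
 * not at all. With 4 MiB objects and parent_overlap == 1 MiB, the
 * rounded overlap is 4 MiB: a request at img_offset 0 overlaps the
 * parent (and may need read-from-parent or copyup handling), while
 * one at img_offset 4 MiB does not.
 */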
1410
Alex Elderbf0d5f502012-11-22 00:00:08 -06001411static void rbd_obj_request_get(struct rbd_obj_request *obj_request)
1412{
Alex Elder37206ee2013-02-20 17:32:08 -06001413 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001414 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001415 kref_get(&obj_request->kref);
1416}
1417
1418static void rbd_obj_request_destroy(struct kref *kref);
1419static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1420{
1421 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001422 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001423 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001424 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1425}
1426
Alex Elder0f2d5be2014-04-26 14:21:44 +04001427static void rbd_img_request_get(struct rbd_img_request *img_request)
1428{
1429 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001430 kref_read(&img_request->kref));
Alex Elder0f2d5be2014-04-26 14:21:44 +04001431 kref_get(&img_request->kref);
1432}
1433
Alex Eldere93f3152013-05-08 22:50:04 -05001434static bool img_request_child_test(struct rbd_img_request *img_request);
1435static void rbd_parent_request_destroy(struct kref *kref);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001436static void rbd_img_request_destroy(struct kref *kref);
1437static void rbd_img_request_put(struct rbd_img_request *img_request)
1438{
1439 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001440 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001441 kref_read(&img_request->kref));
Alex Eldere93f3152013-05-08 22:50:04 -05001442 if (img_request_child_test(img_request))
1443 kref_put(&img_request->kref, rbd_parent_request_destroy);
1444 else
1445 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001446}
1447
1448static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1449 struct rbd_obj_request *obj_request)
1450{
Alex Elder25dcf952013-01-25 17:08:55 -06001451 rbd_assert(obj_request->img_request == NULL);
1452
Alex Elderb155e862013-04-15 14:50:37 -05001453 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001454 obj_request->img_request = img_request;
Alex Elder25dcf952013-01-25 17:08:55 -06001455 obj_request->which = img_request->obj_request_count;
Alex Elder6365d332013-02-11 12:33:24 -06001456 rbd_assert(!obj_request_img_data_test(obj_request));
1457 obj_request_img_data_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001458 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001459 img_request->obj_request_count++;
1460 list_add_tail(&obj_request->links, &img_request->obj_requests);
Alex Elder37206ee2013-02-20 17:32:08 -06001461 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1462 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001463}
1464
1465static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1466 struct rbd_obj_request *obj_request)
1467{
1468 rbd_assert(obj_request->which != BAD_WHICH);
Alex Elder25dcf952013-01-25 17:08:55 -06001469
Alex Elder37206ee2013-02-20 17:32:08 -06001470 dout("%s: img %p obj %p w=%u\n", __func__, img_request, obj_request,
1471 obj_request->which);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001472 list_del(&obj_request->links);
Alex Elder25dcf952013-01-25 17:08:55 -06001473 rbd_assert(img_request->obj_request_count > 0);
1474 img_request->obj_request_count--;
1475 rbd_assert(obj_request->which == img_request->obj_request_count);
1476 obj_request->which = BAD_WHICH;
Alex Elder6365d332013-02-11 12:33:24 -06001477 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001478 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001479 obj_request->img_request = NULL;
Alex Elder25dcf952013-01-25 17:08:55 -06001480 obj_request->callback = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001481 rbd_obj_request_put(obj_request);
1482}
1483
1484static bool obj_request_type_valid(enum obj_request_type type)
1485{
1486 switch (type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001487 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001488 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001489 case OBJ_REQUEST_BVECS:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001490 return true;
1491 default:
1492 return false;
1493 }
1494}
1495
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001496static void rbd_img_obj_callback(struct rbd_obj_request *obj_request);
1497
Ilya Dryomov980917f2016-09-12 18:59:42 +02001498static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001499{
Ilya Dryomov980917f2016-09-12 18:59:42 +02001500 struct ceph_osd_request *osd_req = obj_request->osd_req;
1501
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001502 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
1503 obj_request, obj_request->object_no, obj_request->offset,
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001504 obj_request->length, osd_req);
Ilya Dryomov4a17dad2016-09-13 21:08:10 +02001505 if (obj_request_img_data_test(obj_request)) {
1506 WARN_ON(obj_request->callback != rbd_img_obj_callback);
1507 rbd_img_request_get(obj_request->img_request);
1508 }
Ilya Dryomov980917f2016-09-12 18:59:42 +02001509 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001510}
1511
1512static void rbd_img_request_complete(struct rbd_img_request *img_request)
1513{
Alex Elder55f27e02013-04-10 12:34:25 -05001514
Alex Elder37206ee2013-02-20 17:32:08 -06001515 dout("%s: img %p\n", __func__, img_request);
Alex Elder55f27e02013-04-10 12:34:25 -05001516
1517 /*
1518 * If no error occurred, compute the aggregate transfer
1519 * count for the image request. We could instead use
1520 * atomic64_cmpxchg() to update it as each object request
 1521 * completes; it's not clear offhand which way is better.
1522 */
1523 if (!img_request->result) {
1524 struct rbd_obj_request *obj_request;
1525 u64 xferred = 0;
1526
1527 for_each_obj_request(img_request, obj_request)
1528 xferred += obj_request->xferred;
1529 img_request->xferred = xferred;
1530 }
1531
Alex Elderbf0d5f502012-11-22 00:00:08 -06001532 if (img_request->callback)
1533 img_request->callback(img_request);
1534 else
1535 rbd_img_request_put(img_request);
1536}
1537
Alex Elder0c425242013-02-08 09:55:49 -06001538/*
1539 * The default/initial value for all image request flags is 0. Each
1540 * is conditionally set to 1 at image request initialization time
 1541 * and currently never changes thereafter.
1542 */
1543static void img_request_write_set(struct rbd_img_request *img_request)
1544{
1545 set_bit(IMG_REQ_WRITE, &img_request->flags);
1546 smp_mb();
1547}
1548
1549static bool img_request_write_test(struct rbd_img_request *img_request)
1550{
1551 smp_mb();
1552 return test_bit(IMG_REQ_WRITE, &img_request->flags) != 0;
1553}
1554
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001555/*
 1556 * Set the discard flag when the img_request is a discard request
1557 */
1558static void img_request_discard_set(struct rbd_img_request *img_request)
1559{
1560 set_bit(IMG_REQ_DISCARD, &img_request->flags);
1561 smp_mb();
1562}
1563
1564static bool img_request_discard_test(struct rbd_img_request *img_request)
1565{
1566 smp_mb();
1567 return test_bit(IMG_REQ_DISCARD, &img_request->flags) != 0;
1568}
1569
Alex Elder9849e982013-01-24 16:13:36 -06001570static void img_request_child_set(struct rbd_img_request *img_request)
1571{
1572 set_bit(IMG_REQ_CHILD, &img_request->flags);
1573 smp_mb();
1574}
1575
Alex Eldere93f3152013-05-08 22:50:04 -05001576static void img_request_child_clear(struct rbd_img_request *img_request)
1577{
1578 clear_bit(IMG_REQ_CHILD, &img_request->flags);
1579 smp_mb();
1580}
1581
Alex Elder9849e982013-01-24 16:13:36 -06001582static bool img_request_child_test(struct rbd_img_request *img_request)
1583{
1584 smp_mb();
1585 return test_bit(IMG_REQ_CHILD, &img_request->flags) != 0;
1586}
1587
Alex Elderd0b2e942013-01-24 16:13:36 -06001588static void img_request_layered_set(struct rbd_img_request *img_request)
1589{
1590 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1591 smp_mb();
1592}
1593
Alex Eldera2acd002013-05-08 22:50:04 -05001594static void img_request_layered_clear(struct rbd_img_request *img_request)
1595{
1596 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1597 smp_mb();
1598}
1599
Alex Elderd0b2e942013-01-24 16:13:36 -06001600static bool img_request_layered_test(struct rbd_img_request *img_request)
1601{
1602 smp_mb();
1603 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1604}
1605
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001606static enum obj_operation_type
1607rbd_img_request_op_type(struct rbd_img_request *img_request)
1608{
1609 if (img_request_write_test(img_request))
1610 return OBJ_OP_WRITE;
1611 else if (img_request_discard_test(img_request))
1612 return OBJ_OP_DISCARD;
1613 else
1614 return OBJ_OP_READ;
1615}
1616
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001617static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
1618{
1619 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1620
1621 return !obj_req->offset &&
1622 obj_req->length == rbd_dev->layout.object_size;
1623}
1624
1625static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
1626{
1627 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1628
1629 return obj_req->offset + obj_req->length ==
1630 rbd_dev->layout.object_size;
1631}
1632
1633static bool rbd_img_is_write(struct rbd_img_request *img_req)
1634{
1635 switch (rbd_img_request_op_type(img_req)) {
1636 case OBJ_OP_READ:
1637 return false;
1638 case OBJ_OP_WRITE:
1639 case OBJ_OP_DISCARD:
1640 return true;
1641 default:
1642 rbd_assert(0);
1643 }
1644}
1645
Alex Elder6e2a4502013-03-27 09:16:30 -05001646static void
1647rbd_img_obj_request_read_callback(struct rbd_obj_request *obj_request)
1648{
Alex Elderb9434c52013-04-19 15:34:50 -05001649 u64 xferred = obj_request->xferred;
1650 u64 length = obj_request->length;
1651
Alex Elder6e2a4502013-03-27 09:16:30 -05001652 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1653 obj_request, obj_request->img_request, obj_request->result,
Alex Elderb9434c52013-04-19 15:34:50 -05001654 xferred, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001655 /*
Josh Durgin17c1cc12013-08-26 17:55:38 -07001656 * ENOENT means a hole in the image. We zero-fill the entire
1657 * length of the request. A short read also implies zero-fill
1658 * to the end of the request. An error requires the whole
1659 * length of the request to be reported finished with an error
1660 * to the block layer. In each case we update the xferred
1661 * count to indicate the whole request was satisfied.
Alex Elder6e2a4502013-03-27 09:16:30 -05001662 */
Alex Elderb9434c52013-04-19 15:34:50 -05001663 rbd_assert(obj_request->type != OBJ_REQUEST_NODATA);
Alex Elder6e2a4502013-03-27 09:16:30 -05001664 if (obj_request->result == -ENOENT) {
Alex Elderb9434c52013-04-19 15:34:50 -05001665 if (obj_request->type == OBJ_REQUEST_BIO)
Ilya Dryomov5359a172018-01-20 10:30:10 +01001666 zero_bios(&obj_request->bio_pos, 0, length);
Alex Elderb9434c52013-04-19 15:34:50 -05001667 else
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001668 zero_bvecs(&obj_request->bvec_pos, 0, length);
Alex Elder6e2a4502013-03-27 09:16:30 -05001669 obj_request->result = 0;
Alex Elderb9434c52013-04-19 15:34:50 -05001670 } else if (xferred < length && !obj_request->result) {
1671 if (obj_request->type == OBJ_REQUEST_BIO)
Ilya Dryomov5359a172018-01-20 10:30:10 +01001672 zero_bios(&obj_request->bio_pos, xferred,
1673 length - xferred);
Alex Elderb9434c52013-04-19 15:34:50 -05001674 else
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001675 zero_bvecs(&obj_request->bvec_pos, xferred,
1676 length - xferred);
Alex Elder6e2a4502013-03-27 09:16:30 -05001677 }
Josh Durgin17c1cc12013-08-26 17:55:38 -07001678 obj_request->xferred = length;
Alex Elder6e2a4502013-03-27 09:16:30 -05001679 obj_request_done_set(obj_request);
1680}
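/*
 * Worked example (sketch): an 8192-byte read that the OSD satisfies
 * with only 4096 bytes (xferred == 4096, result == 0) takes the
 * second branch above: bytes 4096..8191 of the data buffer are
 * zeroed and xferred is raised to 8192, so the block layer sees the
 * whole request completed.
 */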
1681
Alex Elderbf0d5f502012-11-22 00:00:08 -06001682static void rbd_obj_request_complete(struct rbd_obj_request *obj_request)
1683{
Alex Elder37206ee2013-02-20 17:32:08 -06001684 dout("%s: obj %p cb %p\n", __func__, obj_request,
1685 obj_request->callback);
Ilya Dryomov2e584bc2018-01-15 17:24:51 +01001686 obj_request->callback(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001687}
1688
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02001689static void rbd_obj_request_error(struct rbd_obj_request *obj_request, int err)
1690{
1691 obj_request->result = err;
1692 obj_request->xferred = 0;
1693 /*
1694 * kludge - mirror rbd_obj_request_submit() to match a put in
1695 * rbd_img_obj_callback()
1696 */
1697 if (obj_request_img_data_test(obj_request)) {
1698 WARN_ON(obj_request->callback != rbd_img_obj_callback);
1699 rbd_img_request_get(obj_request->img_request);
1700 }
1701 obj_request_done_set(obj_request);
1702 rbd_obj_request_complete(obj_request);
1703}
1704
Alex Elderc47f9372013-02-26 14:23:07 -06001705static void rbd_osd_read_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001706{
Alex Elder57acbaa2013-02-11 12:33:24 -06001707 struct rbd_img_request *img_request = NULL;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001708 struct rbd_device *rbd_dev = NULL;
Alex Elder57acbaa2013-02-11 12:33:24 -06001709 bool layered = false;
1710
1711 if (obj_request_img_data_test(obj_request)) {
1712 img_request = obj_request->img_request;
1713 layered = img_request && img_request_layered_test(img_request);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001714 rbd_dev = img_request->rbd_dev;
Alex Elder57acbaa2013-02-11 12:33:24 -06001715 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06001716
1717 dout("%s: obj %p img %p result %d %llu/%llu\n", __func__,
1718 obj_request, img_request, obj_request->result,
1719 obj_request->xferred, obj_request->length);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05001720 if (layered && obj_request->result == -ENOENT &&
1721 obj_request->img_offset < rbd_dev->parent_overlap)
Alex Elder8b3e1a52013-01-24 16:13:36 -06001722 rbd_img_parent_read(obj_request);
1723 else if (img_request)
Alex Elder6e2a4502013-03-27 09:16:30 -05001724 rbd_img_obj_request_read_callback(obj_request);
1725 else
1726 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001727}
1728
Alex Elderc47f9372013-02-26 14:23:07 -06001729static void rbd_osd_write_callback(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001730{
Sage Weil1b83bef2013-02-25 16:11:12 -08001731 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1732 obj_request->result, obj_request->length);
1733 /*
Alex Elder8b3e1a52013-01-24 16:13:36 -06001734 * There is no such thing as a successful short write. Set
1735 * it to our originally-requested length.
Sage Weil1b83bef2013-02-25 16:11:12 -08001736 */
1737 obj_request->xferred = obj_request->length;
Alex Elder07741302013-02-05 23:41:50 -06001738 obj_request_done_set(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001739}
1740
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001741static void rbd_osd_discard_callback(struct rbd_obj_request *obj_request)
1742{
1743 dout("%s: obj %p result %d %llu\n", __func__, obj_request,
1744 obj_request->result, obj_request->length);
1745 /*
1746 * There is no such thing as a successful short discard. Set
1747 * it to our originally-requested length.
1748 */
1749 obj_request->xferred = obj_request->length;
Josh Durgind0265de2014-04-07 16:54:10 -07001750 /* discarding a non-existent object is not a problem */
1751 if (obj_request->result == -ENOENT)
1752 obj_request->result = 0;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001753 obj_request_done_set(obj_request);
1754}
1755
Alex Elderfbfab532013-02-08 09:55:48 -06001756/*
1757 * For a simple stat call there's nothing to do. We'll do more if
1758 * this is part of a write sequence for a layered image.
1759 */
Alex Elderc47f9372013-02-26 14:23:07 -06001760static void rbd_osd_stat_callback(struct rbd_obj_request *obj_request)
Alex Elderfbfab532013-02-08 09:55:48 -06001761{
Alex Elder37206ee2013-02-20 17:32:08 -06001762 dout("%s: obj %p\n", __func__, obj_request);
Alex Elderfbfab532013-02-08 09:55:48 -06001763 obj_request_done_set(obj_request);
1764}
1765
Ilya Dryomov27617132015-07-16 17:36:11 +03001766static void rbd_osd_call_callback(struct rbd_obj_request *obj_request)
1767{
1768 dout("%s: obj %p\n", __func__, obj_request);
1769
1770 if (obj_request_img_data_test(obj_request))
1771 rbd_osd_copyup_callback(obj_request);
1772 else
1773 obj_request_done_set(obj_request);
1774}
1775
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001776static void rbd_obj_handle_request(struct rbd_obj_request *obj_req);
1777
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001778static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001779{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001780 struct rbd_obj_request *obj_req = osd_req->r_priv;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001781
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001782 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1783 osd_req->r_result, obj_req);
1784 rbd_assert(osd_req == obj_req->osd_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001785
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001786 obj_req->result = osd_req->r_result < 0 ? osd_req->r_result : 0;
1787 if (!obj_req->result && !rbd_img_is_write(obj_req->img_request))
1788 obj_req->xferred = osd_req->r_result;
1789 else
1790 /*
1791 * Writes aren't allowed to return a data payload. In some
1792 * guarded write cases (e.g. stat + zero on an empty object)
1793 * a stat response makes it through, but we don't care.
1794 */
1795 obj_req->xferred = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001796
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001797 rbd_obj_handle_request(obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001798}
1799
Alex Elder9d4df012013-04-19 15:34:50 -05001800static void rbd_osd_req_format_read(struct rbd_obj_request *obj_request)
Alex Elder430c28c2013-04-03 21:32:51 -05001801{
Alex Elder8c042b02013-04-03 01:28:58 -05001802 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder430c28c2013-04-03 21:32:51 -05001803
Ilya Dryomov7c848832016-09-15 17:56:39 +02001804 rbd_assert(obj_request_img_data_test(obj_request));
1805 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001806}
1807
1808static void rbd_osd_req_format_write(struct rbd_obj_request *obj_request)
1809{
Alex Elder9d4df012013-04-19 15:34:50 -05001810 struct ceph_osd_request *osd_req = obj_request->osd_req;
Alex Elder9d4df012013-04-19 15:34:50 -05001811
Deepa Dinamani1134e092017-05-08 15:59:19 -07001812 ktime_get_real_ts(&osd_req->r_mtime);
Ilya Dryomovbb873b5392016-05-26 00:29:52 +02001813 osd_req->r_data_offset = obj_request->offset;
Alex Elder430c28c2013-04-03 21:32:51 -05001814}
1815
Ilya Dryomovbc812072017-01-25 18:16:23 +01001816static struct ceph_osd_request *
1817__rbd_osd_req_create(struct rbd_device *rbd_dev,
1818 struct ceph_snap_context *snapc,
1819 int num_ops, unsigned int flags,
1820 struct rbd_obj_request *obj_request)
1821{
1822 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1823 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001824 const char *name_format = rbd_dev->image_format == 1 ?
1825 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001826
1827 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
1828 if (!req)
1829 return NULL;
1830
1831 req->r_flags = flags;
1832 req->r_callback = rbd_osd_req_callback;
1833 req->r_priv = obj_request;
1834
1835 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001836 if (ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1837 rbd_dev->header.object_prefix, obj_request->object_no))
Ilya Dryomovbc812072017-01-25 18:16:23 +01001838 goto err_req;
1839
1840 if (ceph_osdc_alloc_messages(req, GFP_NOIO))
1841 goto err_req;
1842
1843 return req;
1844
1845err_req:
1846 ceph_osdc_put_request(req);
1847 return NULL;
1848}
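/*
 * Illustrative note: the object name is printed from the image's
 * object prefix and the object number using the format-specific
 * template from rbd_types.h. For a format 2 image with a
 * (hypothetical) prefix "rbd_data.1026b8b4567", object number 1
 * yields a name along the lines of
 * "rbd_data.1026b8b4567.0000000000000001" (16 hex digits); format 1
 * images use a shorter, 12-hex-digit suffix.
 */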
1849
Alex Elderbf0d5f502012-11-22 00:00:08 -06001850static struct ceph_osd_request *rbd_osd_req_create(
1851 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001852 enum obj_operation_type op_type,
Ilya Dryomovdeb236b2014-02-25 16:22:27 +02001853 unsigned int num_ops,
Alex Elder430c28c2013-04-03 21:32:51 -05001854 struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001855{
Alex Elderbf0d5f502012-11-22 00:00:08 -06001856 struct ceph_snap_context *snapc = NULL;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001857
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001858 if (obj_request_img_data_test(obj_request) &&
1859 (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_WRITE)) {
Alex Elder6365d332013-02-11 12:33:24 -06001860 struct rbd_img_request *img_request = obj_request->img_request;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08001861 if (op_type == OBJ_OP_WRITE) {
1862 rbd_assert(img_request_write_test(img_request));
1863 } else {
1864 rbd_assert(img_request_discard_test(img_request));
1865 }
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001866 snapc = img_request->snapc;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001867 }
1868
Ilya Dryomovbc812072017-01-25 18:16:23 +01001869 return __rbd_osd_req_create(rbd_dev, snapc, num_ops,
1870 (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD) ?
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001871 CEPH_OSD_FLAG_WRITE : CEPH_OSD_FLAG_READ, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001872}
1873
Alex Elder0eefd472013-04-19 15:34:50 -05001874/*
Josh Durgind3246fb2014-04-07 16:49:21 -07001875 * Create a copyup osd request based on the information in the object
 1876 * request supplied. A copyup request has two or three osd ops: a
1877 * copyup method call, potentially a hint op, and a write or truncate
1878 * or zero op.
Alex Elder0eefd472013-04-19 15:34:50 -05001879 */
1880static struct ceph_osd_request *
1881rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
1882{
1883 struct rbd_img_request *img_request;
Josh Durgind3246fb2014-04-07 16:49:21 -07001884 int num_osd_ops = 3;
Alex Elder0eefd472013-04-19 15:34:50 -05001885
1886 rbd_assert(obj_request_img_data_test(obj_request));
1887 img_request = obj_request->img_request;
1888 rbd_assert(img_request);
Josh Durgind3246fb2014-04-07 16:49:21 -07001889 rbd_assert(img_request_write_test(img_request) ||
1890 img_request_discard_test(img_request));
Alex Elder0eefd472013-04-19 15:34:50 -05001891
Josh Durgind3246fb2014-04-07 16:49:21 -07001892 if (img_request_discard_test(img_request))
1893 num_osd_ops = 2;
1894
Ilya Dryomovbc812072017-01-25 18:16:23 +01001895 return __rbd_osd_req_create(img_request->rbd_dev,
1896 img_request->snapc, num_osd_ops,
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001897 CEPH_OSD_FLAG_WRITE, obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05001898}
1899
Alex Elderbf0d5f502012-11-22 00:00:08 -06001900static void rbd_osd_req_destroy(struct ceph_osd_request *osd_req)
1901{
1902 ceph_osdc_put_request(osd_req);
1903}
1904
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001905static struct rbd_obj_request *
1906rbd_obj_request_create(enum obj_request_type type)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001907{
1908 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001909
1910 rbd_assert(obj_request_type_valid(type));
1911
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001912 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001913 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001914 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001915
Alex Elderbf0d5f502012-11-22 00:00:08 -06001916 obj_request->which = BAD_WHICH;
1917 obj_request->type = type;
1918 INIT_LIST_HEAD(&obj_request->links);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001919 kref_init(&obj_request->kref);
1920
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001921 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001922 return obj_request;
1923}
1924
1925static void rbd_obj_request_destroy(struct kref *kref)
1926{
1927 struct rbd_obj_request *obj_request;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001928 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001929
1930 obj_request = container_of(kref, struct rbd_obj_request, kref);
1931
Alex Elder37206ee2013-02-20 17:32:08 -06001932 dout("%s: obj %p\n", __func__, obj_request);
1933
Alex Elderbf0d5f502012-11-22 00:00:08 -06001934 rbd_assert(obj_request->img_request == NULL);
1935 rbd_assert(obj_request->which == BAD_WHICH);
1936
1937 if (obj_request->osd_req)
1938 rbd_osd_req_destroy(obj_request->osd_req);
1939
Alex Elderbf0d5f502012-11-22 00:00:08 -06001940 switch (obj_request->type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001941 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001942 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001943 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001944 break; /* Nothing to do */
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001945 default:
1946 rbd_assert(0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001947 }
1948
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001949 if (obj_request->copyup_bvecs) {
1950 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1951 if (obj_request->copyup_bvecs[i].bv_page)
1952 __free_page(obj_request->copyup_bvecs[i].bv_page);
1953 }
1954 kfree(obj_request->copyup_bvecs);
1955 }
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01001956
Alex Elder868311b2013-05-01 12:43:03 -05001957 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001958}
1959
Alex Elderfb65d2282013-05-08 22:50:04 -05001960/* It's OK to call this for a device with no parent */
1961
1962static void rbd_spec_put(struct rbd_spec *spec);
1963static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1964{
1965 rbd_dev_remove_parent(rbd_dev);
1966 rbd_spec_put(rbd_dev->parent_spec);
1967 rbd_dev->parent_spec = NULL;
1968 rbd_dev->parent_overlap = 0;
1969}
1970
Alex Elderbf0d5f502012-11-22 00:00:08 -06001971/*
Alex Eldera2acd002013-05-08 22:50:04 -05001972 * Parent image reference counting is used to determine when an
1973 * image's parent fields can be safely torn down--after there are no
1974 * more in-flight requests to the parent image. When the last
1975 * reference is dropped, cleaning them up is safe.
1976 */
1977static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1978{
1979 int counter;
1980
1981 if (!rbd_dev->parent_spec)
1982 return;
1983
1984 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1985 if (counter > 0)
1986 return;
1987
1988 /* Last reference; clean up parent data structures */
1989
1990 if (!counter)
1991 rbd_dev_unparent(rbd_dev);
1992 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001993 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001994}
1995
1996/*
1997 * If an image has a non-zero parent overlap, get a reference to its
1998 * parent.
1999 *
2000 * Returns true if the rbd device has a parent with a non-zero
2001 * overlap and a reference for it was successfully taken, or
2002 * false otherwise.
2003 */
2004static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
2005{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002006 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05002007
2008 if (!rbd_dev->parent_spec)
2009 return false;
2010
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002011 down_read(&rbd_dev->header_rwsem);
2012 if (rbd_dev->parent_overlap)
2013 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
2014 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05002015
2016 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04002017 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05002018
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03002019 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05002020}
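/*
 * Usage sketch (mirrors the image request code below): the get/put
 * pair brackets the lifetime of a layered image request:
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *	...
 *	if (img_request_layered_test(img_request)) {
 *		img_request_layered_clear(img_request);
 *		rbd_dev_parent_put(img_request->rbd_dev);
 *	}
 */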
2021
Alex Elderbf0d5f502012-11-22 00:00:08 -06002022/*
2023 * Caller is responsible for filling in the list of object requests
2024 * that comprises the image request, and the Linux request pointer
2025 * (if there is one).
2026 */
Alex Eldercc344fa2013-02-19 12:25:56 -06002027static struct rbd_img_request *rbd_img_request_create(
2028 struct rbd_device *rbd_dev,
Alex Elderbf0d5f502012-11-22 00:00:08 -06002029 u64 offset, u64 length,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002030 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07002031 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002032{
2033 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002034
Ilya Dryomova0c58952018-01-22 16:03:06 +01002035 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002036 if (!img_request)
2037 return NULL;
2038
Alex Elderbf0d5f502012-11-22 00:00:08 -06002039 img_request->rbd_dev = rbd_dev;
2040 img_request->offset = offset;
2041 img_request->length = length;
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002042 if (op_type == OBJ_OP_DISCARD) {
2043 img_request_discard_set(img_request);
2044 img_request->snapc = snapc;
2045 } else if (op_type == OBJ_OP_WRITE) {
Alex Elder0c425242013-02-08 09:55:49 -06002046 img_request_write_set(img_request);
Josh Durgin4e752f02014-04-08 11:12:11 -07002047 img_request->snapc = snapc;
Alex Elder0c425242013-02-08 09:55:49 -06002048 } else {
Alex Elderbf0d5f502012-11-22 00:00:08 -06002049 img_request->snap_id = rbd_dev->spec->snap_id;
Alex Elder0c425242013-02-08 09:55:49 -06002050 }
Alex Eldera2acd002013-05-08 22:50:04 -05002051 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06002052 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01002053
Alex Elderbf0d5f502012-11-22 00:00:08 -06002054 spin_lock_init(&img_request->completion_lock);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002055 INIT_LIST_HEAD(&img_request->obj_requests);
2056 kref_init(&img_request->kref);
2057
Alex Elder37206ee2013-02-20 17:32:08 -06002058 dout("%s: rbd_dev %p %s %llu/%llu -> img %p\n", __func__, rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002059 obj_op_name(op_type), offset, length, img_request);
Alex Elder37206ee2013-02-20 17:32:08 -06002060
Alex Elderbf0d5f502012-11-22 00:00:08 -06002061 return img_request;
2062}
2063
2064static void rbd_img_request_destroy(struct kref *kref)
2065{
2066 struct rbd_img_request *img_request;
2067 struct rbd_obj_request *obj_request;
2068 struct rbd_obj_request *next_obj_request;
2069
2070 img_request = container_of(kref, struct rbd_img_request, kref);
2071
Alex Elder37206ee2013-02-20 17:32:08 -06002072 dout("%s: img %p\n", __func__, img_request);
2073
Alex Elderbf0d5f502012-11-22 00:00:08 -06002074 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
2075 rbd_img_obj_request_del(img_request, obj_request);
Alex Elder25dcf952013-01-25 17:08:55 -06002076 rbd_assert(img_request->obj_request_count == 0);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002077
Alex Eldera2acd002013-05-08 22:50:04 -05002078 if (img_request_layered_test(img_request)) {
2079 img_request_layered_clear(img_request);
2080 rbd_dev_parent_put(img_request->rbd_dev);
2081 }
2082
Josh Durginbef95452014-04-04 17:47:52 -07002083 if (img_request_write_test(img_request) ||
2084 img_request_discard_test(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05002085 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002086
Alex Elder1c2a9df2013-05-01 12:43:03 -05002087 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002088}
2089
Alex Eldere93f3152013-05-08 22:50:04 -05002090static struct rbd_img_request *rbd_parent_request_create(
2091 struct rbd_obj_request *obj_request,
2092 u64 img_offset, u64 length)
2093{
2094 struct rbd_img_request *parent_request;
2095 struct rbd_device *rbd_dev;
2096
2097 rbd_assert(obj_request->img_request);
2098 rbd_dev = obj_request->img_request->rbd_dev;
2099
Josh Durgin4e752f02014-04-08 11:12:11 -07002100 parent_request = rbd_img_request_create(rbd_dev->parent, img_offset,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002101 length, OBJ_OP_READ, NULL);
Alex Eldere93f3152013-05-08 22:50:04 -05002102 if (!parent_request)
2103 return NULL;
2104
2105 img_request_child_set(parent_request);
2106 rbd_obj_request_get(obj_request);
2107 parent_request->obj_request = obj_request;
2108
2109 return parent_request;
2110}
2111
2112static void rbd_parent_request_destroy(struct kref *kref)
2113{
2114 struct rbd_img_request *parent_request;
2115 struct rbd_obj_request *orig_request;
2116
2117 parent_request = container_of(kref, struct rbd_img_request, kref);
2118 orig_request = parent_request->obj_request;
2119
2120 parent_request->obj_request = NULL;
2121 rbd_obj_request_put(orig_request);
2122 img_request_child_clear(parent_request);
2123
2124 rbd_img_request_destroy(kref);
2125}
2126
Alex Elder12178572013-02-08 09:55:49 -06002127static bool rbd_img_obj_end_request(struct rbd_obj_request *obj_request)
2128{
Alex Elder6365d332013-02-11 12:33:24 -06002129 struct rbd_img_request *img_request;
Alex Elder12178572013-02-08 09:55:49 -06002130 unsigned int xferred;
2131 int result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002132 bool more;
Alex Elder12178572013-02-08 09:55:49 -06002133
Alex Elder6365d332013-02-11 12:33:24 -06002134 rbd_assert(obj_request_img_data_test(obj_request));
2135 img_request = obj_request->img_request;
2136
Alex Elder12178572013-02-08 09:55:49 -06002137 rbd_assert(obj_request->xferred <= (u64)UINT_MAX);
2138 xferred = (unsigned int)obj_request->xferred;
2139 result = obj_request->result;
2140 if (result) {
2141 struct rbd_device *rbd_dev = img_request->rbd_dev;
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002142 enum obj_operation_type op_type;
2143
Guangliang Zhao90e98c52014-04-01 22:22:16 +08002144 if (img_request_discard_test(img_request))
2145 op_type = OBJ_OP_DISCARD;
2146 else if (img_request_write_test(img_request))
2147 op_type = OBJ_OP_WRITE;
2148 else
2149 op_type = OBJ_OP_READ;
Alex Elder12178572013-02-08 09:55:49 -06002150
Ilya Dryomov9584d502014-07-11 12:11:20 +04002151 rbd_warn(rbd_dev, "%s %llx at %llx (%llx)",
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08002152 obj_op_name(op_type), obj_request->length,
2153 obj_request->img_offset, obj_request->offset);
Ilya Dryomov9584d502014-07-11 12:11:20 +04002154 rbd_warn(rbd_dev, " result %d xferred %x",
Alex Elder12178572013-02-08 09:55:49 -06002155 result, xferred);
2156 if (!img_request->result)
2157 img_request->result = result;
Ilya Dryomov082a75d2015-04-25 15:56:15 +03002158 /*
2159 * Need to end I/O on the entire obj_request worth of
2160 * bytes in case of error.
2161 */
2162 xferred = obj_request->length;
Alex Elder12178572013-02-08 09:55:49 -06002163 }
2164
Alex Elder8b3e1a52013-01-24 16:13:36 -06002165 if (img_request_child_test(img_request)) {
2166 rbd_assert(img_request->obj_request != NULL);
2167 more = obj_request->which < img_request->obj_request_count - 1;
2168 } else {
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002169 blk_status_t status = errno_to_blk_status(result);
2170
Alex Elder8b3e1a52013-01-24 16:13:36 -06002171 rbd_assert(img_request->rq != NULL);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01002172
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002173 more = blk_update_request(img_request->rq, status, xferred);
Christoph Hellwig7ad18af2015-01-13 17:20:04 +01002174 if (!more)
Christoph Hellwig2a842ac2017-06-03 09:38:04 +02002175 __blk_mq_end_request(img_request->rq, status);
Alex Elder8b3e1a52013-01-24 16:13:36 -06002176 }
2177
2178 return more;
Alex Elder12178572013-02-08 09:55:49 -06002179}
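/*
 * Note (illustrative): blk_update_request() returns true while the
 * block-layer request still has bytes outstanding; only once it
 * returns false does __blk_mq_end_request() above complete the
 * request.
 */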
2180
Alex Elder21692382013-04-05 01:27:12 -05002181static void rbd_img_obj_callback(struct rbd_obj_request *obj_request)
2182{
2183 struct rbd_img_request *img_request;
2184 u32 which = obj_request->which;
2185 bool more = true;
2186
Alex Elder6365d332013-02-11 12:33:24 -06002187 rbd_assert(obj_request_img_data_test(obj_request));
Alex Elder21692382013-04-05 01:27:12 -05002188 img_request = obj_request->img_request;
2189
2190 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
2191 rbd_assert(img_request != NULL);
Alex Elder21692382013-04-05 01:27:12 -05002192 rbd_assert(img_request->obj_request_count > 0);
2193 rbd_assert(which != BAD_WHICH);
2194 rbd_assert(which < img_request->obj_request_count);
Alex Elder21692382013-04-05 01:27:12 -05002195
2196 spin_lock_irq(&img_request->completion_lock);
2197 if (which != img_request->next_completion)
2198 goto out;
2199
2200 for_each_obj_request_from(img_request, obj_request) {
Alex Elder21692382013-04-05 01:27:12 -05002201 rbd_assert(more);
2202 rbd_assert(which < img_request->obj_request_count);
2203
2204 if (!obj_request_done_test(obj_request))
2205 break;
Alex Elder12178572013-02-08 09:55:49 -06002206 more = rbd_img_obj_end_request(obj_request);
Alex Elder21692382013-04-05 01:27:12 -05002207 which++;
2208 }
2209
2210 rbd_assert(more ^ (which == img_request->obj_request_count));
2211 img_request->next_completion = which;
2212out:
2213 spin_unlock_irq(&img_request->completion_lock);
Alex Elder0f2d5be2014-04-26 14:21:44 +04002214 rbd_img_request_put(img_request);
Alex Elder21692382013-04-05 01:27:12 -05002215
2216 if (!more)
2217 rbd_img_request_complete(img_request);
2218}
2219
Alex Elderf1a47392013-04-19 15:34:50 -05002220/*
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002221 * Add individual osd ops to the given ceph_osd_request and prepare
2222 * them for submission. num_ops is the current number of
 2223 * osd operations already added to the object request.
2224 */
2225static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request,
2226 struct ceph_osd_request *osd_request,
2227 enum obj_operation_type op_type,
2228 unsigned int num_ops)
2229{
2230 struct rbd_img_request *img_request = obj_request->img_request;
2231 struct rbd_device *rbd_dev = img_request->rbd_dev;
2232 u64 object_size = rbd_obj_bytes(&rbd_dev->header);
2233 u64 offset = obj_request->offset;
2234 u64 length = obj_request->length;
2235 u64 img_end;
2236 u16 opcode;
2237
2238 if (op_type == OBJ_OP_DISCARD) {
Josh Durgind3246fb2014-04-07 16:49:21 -07002239 if (!offset && length == object_size &&
2240 (!img_request_layered_test(img_request) ||
2241 !obj_request_overlaps_parent(obj_request))) {
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002242 opcode = CEPH_OSD_OP_DELETE;
 2243 		} else if (offset + length == object_size) {
2244 opcode = CEPH_OSD_OP_TRUNCATE;
2245 } else {
2246 down_read(&rbd_dev->header_rwsem);
2247 img_end = rbd_dev->header.image_size;
2248 up_read(&rbd_dev->header_rwsem);
2249
2250 if (obj_request->img_offset + length == img_end)
2251 opcode = CEPH_OSD_OP_TRUNCATE;
2252 else
2253 opcode = CEPH_OSD_OP_ZERO;
2254 }
2255 } else if (op_type == OBJ_OP_WRITE) {
Ilya Dryomove30b7572015-10-07 17:27:17 +02002256 if (!offset && length == object_size)
2257 opcode = CEPH_OSD_OP_WRITEFULL;
2258 else
2259 opcode = CEPH_OSD_OP_WRITE;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002260 osd_req_op_alloc_hint_init(osd_request, num_ops,
2261 object_size, object_size);
2262 num_ops++;
2263 } else {
2264 opcode = CEPH_OSD_OP_READ;
2265 }
2266
Ilya Dryomov7e868b62014-11-21 22:16:43 +03002267 if (opcode == CEPH_OSD_OP_DELETE)
Yan, Zheng144cba12015-04-27 11:09:54 +08002268 osd_req_op_init(osd_request, num_ops, opcode, 0);
Ilya Dryomov7e868b62014-11-21 22:16:43 +03002269 else
2270 osd_req_op_extent_init(osd_request, num_ops, opcode,
2271 offset, length, 0, 0);
2272
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002273 if (obj_request->type == OBJ_REQUEST_BIO)
2274 osd_req_op_extent_osd_data_bio(osd_request, num_ops,
Ilya Dryomov5359a172018-01-20 10:30:10 +01002275 &obj_request->bio_pos, length);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002276 else if (obj_request->type == OBJ_REQUEST_BVECS)
2277 osd_req_op_extent_osd_data_bvec_pos(osd_request, num_ops,
2278 &obj_request->bvec_pos);
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002279
2280 /* Discards are also writes */
2281 if (op_type == OBJ_OP_WRITE || op_type == OBJ_OP_DISCARD)
2282 rbd_osd_req_format_write(obj_request);
2283 else
2284 rbd_osd_req_format_read(obj_request);
2285}
2286
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002287static void rbd_osd_req_setup_data(struct rbd_obj_request *obj_req, u32 which)
2288{
2289 switch (obj_req->type) {
2290 case OBJ_REQUEST_BIO:
2291 osd_req_op_extent_osd_data_bio(obj_req->osd_req, which,
2292 &obj_req->bio_pos,
2293 obj_req->length);
2294 break;
2295 case OBJ_REQUEST_BVECS:
2296 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
2297 obj_req->length);
2298 osd_req_op_extent_osd_data_bvec_pos(obj_req->osd_req, which,
2299 &obj_req->bvec_pos);
2300 break;
2301 default:
2302 rbd_assert(0);
2303 }
2304}
2305
2306static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
2307{
2308 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2309
2310 obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1, obj_req);
2311 if (!obj_req->osd_req)
2312 return -ENOMEM;
2313
2314 osd_req_op_extent_init(obj_req->osd_req, 0, CEPH_OSD_OP_READ,
2315 obj_req->offset, obj_req->length, 0, 0);
2316 rbd_osd_req_setup_data(obj_req, 0);
2317
2318 rbd_osd_req_format_read(obj_req);
2319 return 0;
2320}
2321
2322static int __rbd_obj_setup_stat(struct rbd_obj_request *obj_req,
2323 unsigned int which)
2324{
2325 struct page **pages;
2326
2327 /*
2328 * The response data for a STAT call consists of:
2329 * le64 length;
2330 * struct {
2331 * le32 tv_sec;
2332 * le32 tv_nsec;
2333 * } mtime;
2334 */
2335 pages = ceph_alloc_page_vector(1, GFP_NOIO);
2336 if (IS_ERR(pages))
2337 return PTR_ERR(pages);
2338
2339 osd_req_op_init(obj_req->osd_req, which, CEPH_OSD_OP_STAT, 0);
2340 osd_req_op_raw_data_in_pages(obj_req->osd_req, which, pages,
2341 8 + sizeof(struct ceph_timespec),
2342 0, false, true);
2343 return 0;
2344}
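/*
 * Hypothetical decode of the reply buffer laid out above (a sketch,
 * not something the driver does; shown only to make the layout
 * concrete):
 *
 *	void *p = kmap(pages[0]);
 *	u64 obj_len = le64_to_cpu(*(__le64 *)p);
 *	struct ceph_timespec *mtime = p + sizeof(__le64);
 *	kunmap(pages[0]);
 */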
2345
2346static void __rbd_obj_setup_write(struct rbd_obj_request *obj_req,
2347 unsigned int which)
2348{
2349 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2350 u16 opcode;
2351
2352 osd_req_op_alloc_hint_init(obj_req->osd_req, which++,
2353 rbd_dev->layout.object_size,
2354 rbd_dev->layout.object_size);
2355
2356 if (rbd_obj_is_entire(obj_req))
2357 opcode = CEPH_OSD_OP_WRITEFULL;
2358 else
2359 opcode = CEPH_OSD_OP_WRITE;
2360
2361 osd_req_op_extent_init(obj_req->osd_req, which, opcode,
2362 obj_req->offset, obj_req->length, 0, 0);
2363 rbd_osd_req_setup_data(obj_req, which++);
2364
2365 rbd_assert(which == obj_req->osd_req->r_num_ops);
2366 rbd_osd_req_format_write(obj_req);
2367}
2368
2369static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
2370{
2371 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2372 unsigned int num_osd_ops, which = 0;
2373 int ret;
2374
2375 if (obj_request_overlaps_parent(obj_req)) {
2376 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2377 num_osd_ops = 3; /* stat + setallochint + write/writefull */
2378 } else {
2379 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2380 num_osd_ops = 2; /* setallochint + write/writefull */
2381 }
2382
2383 obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_WRITE,
2384 num_osd_ops, obj_req);
2385 if (!obj_req->osd_req)
2386 return -ENOMEM;
2387
2388 if (obj_request_overlaps_parent(obj_req)) {
2389 ret = __rbd_obj_setup_stat(obj_req, which++);
2390 if (ret)
2391 return ret;
2392 }
2393
2394 __rbd_obj_setup_write(obj_req, which);
2395 return 0;
2396}
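/*
 * Resulting op layout (summary of the function above):
 *
 *	overlaps parent:  [ stat, setallochint, write|writefull ]
 *	no overlap:       [ setallochint, write|writefull ]
 *
 * WRITEFULL is chosen when the request covers the entire object.
 */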
2397
2398static void __rbd_obj_setup_discard(struct rbd_obj_request *obj_req,
2399 unsigned int which)
2400{
2401 u16 opcode;
2402
2403 if (rbd_obj_is_entire(obj_req)) {
2404 if (obj_request_overlaps_parent(obj_req)) {
2405 opcode = CEPH_OSD_OP_TRUNCATE;
2406 } else {
2407 osd_req_op_init(obj_req->osd_req, which++,
2408 CEPH_OSD_OP_DELETE, 0);
2409 opcode = 0;
2410 }
2411 } else if (rbd_obj_is_tail(obj_req)) {
2412 opcode = CEPH_OSD_OP_TRUNCATE;
2413 } else {
2414 opcode = CEPH_OSD_OP_ZERO;
2415 }
2416
2417 if (opcode)
2418 osd_req_op_extent_init(obj_req->osd_req, which++, opcode,
2419 obj_req->offset, obj_req->length,
2420 0, 0);
2421
2422 rbd_assert(which == obj_req->osd_req->r_num_ops);
2423 rbd_osd_req_format_write(obj_req);
2424}
2425
2426static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
2427{
2428 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2429 unsigned int num_osd_ops, which = 0;
2430 int ret;
2431
2432 if (rbd_obj_is_entire(obj_req)) {
2433 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2434 num_osd_ops = 1; /* truncate/delete */
2435 } else {
2436 if (obj_request_overlaps_parent(obj_req)) {
2437 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
2438 num_osd_ops = 2; /* stat + truncate/zero */
2439 } else {
2440 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
2441 num_osd_ops = 1; /* truncate/zero */
2442 }
2443 }
2444
2445 obj_req->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_DISCARD,
2446 num_osd_ops, obj_req);
2447 if (!obj_req->osd_req)
2448 return -ENOMEM;
2449
2450 if (!rbd_obj_is_entire(obj_req) &&
2451 obj_request_overlaps_parent(obj_req)) {
2452 ret = __rbd_obj_setup_stat(obj_req, which++);
2453 if (ret)
2454 return ret;
2455 }
2456
2457 __rbd_obj_setup_discard(obj_req, which);
2458 return 0;
2459}
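/*
 * Opcode selection for discard (summary of the two functions above):
 *
 *	entire object, no parent overlap:  DELETE
 *	entire object, parent overlap:     TRUNCATE (keep the object so
 *	                                   reads don't fall through to
 *	                                   the parent)
 *	tail of object:                    TRUNCATE
 *	middle of object:                  ZERO
 */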
2460
2461/*
2462 * For each object request in @img_req, allocate an OSD request, add
2463 * individual OSD ops and prepare them for submission. The number of
2464 * OSD ops depends on op_type and the overlap point (if any).
2465 */
2466static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2467{
2468 struct rbd_obj_request *obj_req;
2469 int ret;
2470
2471 for_each_obj_request(img_req, obj_req) {
2472 switch (rbd_img_request_op_type(img_req)) {
2473 case OBJ_OP_READ:
2474 ret = rbd_obj_setup_read(obj_req);
2475 break;
2476 case OBJ_OP_WRITE:
2477 ret = rbd_obj_setup_write(obj_req);
2478 break;
2479 case OBJ_OP_DISCARD:
2480 ret = rbd_obj_setup_discard(obj_req);
2481 break;
2482 default:
2483 rbd_assert(0);
2484 }
2485 if (ret)
2486 return ret;
2487 }
2488
2489 return 0;
2490}
2491
Josh Durgin3b434a2a2014-04-04 17:32:15 -07002492/*
Alex Elderf1a47392013-04-19 15:34:50 -05002493 * Split up an image request into one or more object requests, each
2494 * to a different object. The "type" parameter indicates whether
2495 * "data_desc" is the pointer to the head of a list of bio
2496 * structures, or the base of a page array. In either case this
2497 * function assumes data_desc describes memory sufficient to hold
2498 * all data described by the image request.
2499 */
2500static int rbd_img_request_fill(struct rbd_img_request *img_request,
2501 enum obj_request_type type,
2502 void *data_desc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002503{
2504 struct rbd_device *rbd_dev = img_request->rbd_dev;
2505 struct rbd_obj_request *obj_request = NULL;
2506 struct rbd_obj_request *next_obj_request;
Ilya Dryomov5359a172018-01-20 10:30:10 +01002507 struct ceph_bio_iter bio_it;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002508 struct ceph_bvec_iter bvec_it;
Alex Elder7da22d22013-01-24 16:13:36 -06002509 u64 img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002510 u64 resid;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002511
Alex Elderf1a47392013-04-19 15:34:50 -05002512 dout("%s: img %p type %d data_desc %p\n", __func__, img_request,
2513 (int)type, data_desc);
Alex Elder37206ee2013-02-20 17:32:08 -06002514
Alex Elder7da22d22013-01-24 16:13:36 -06002515 img_offset = img_request->offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002516 resid = img_request->length;
Alex Elder4dda41d2013-02-20 21:59:33 -06002517 rbd_assert(resid > 0);
Alex Elderf1a47392013-04-19 15:34:50 -05002518
2519 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002520 bio_it = *(struct ceph_bio_iter *)data_desc;
Kent Overstreet4f024f32013-10-11 15:44:27 -07002521 rbd_assert(img_offset ==
Ilya Dryomov5359a172018-01-20 10:30:10 +01002522 bio_it.iter.bi_sector << SECTOR_SHIFT);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002523 } else if (type == OBJ_REQUEST_BVECS) {
2524 bvec_it = *(struct ceph_bvec_iter *)data_desc;
Alex Elderf1a47392013-04-19 15:34:50 -05002525 }
2526
Alex Elderbf0d5f502012-11-22 00:00:08 -06002527 while (resid) {
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002528 u64 object_no = img_offset >> rbd_dev->header.obj_order;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002529 u64 offset = rbd_segment_offset(rbd_dev, img_offset);
2530 u64 length = rbd_segment_length(rbd_dev, img_offset, resid);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002531
Ilya Dryomov6c696d82017-01-25 18:16:23 +01002532 obj_request = rbd_obj_request_create(type);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002533 if (!obj_request)
2534 goto out_unwind;
Ilya Dryomov62054da2014-03-04 11:57:17 +02002535
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002536 obj_request->object_no = object_no;
Ilya Dryomov67e2b652017-01-25 18:16:22 +01002537 obj_request->offset = offset;
2538 obj_request->length = length;
2539
Josh Durgin03507db2013-08-27 14:45:46 -07002540 /*
2541 * set obj_request->img_request before creating the
2542 * osd_request so that it gets the right snapc
2543 */
2544 rbd_img_obj_request_add(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002545
Alex Elderf1a47392013-04-19 15:34:50 -05002546 if (type == OBJ_REQUEST_BIO) {
Ilya Dryomov5359a172018-01-20 10:30:10 +01002547 obj_request->bio_pos = bio_it;
2548 ceph_bio_iter_advance(&bio_it, length);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002549 } else if (type == OBJ_REQUEST_BVECS) {
2550 obj_request->bvec_pos = bvec_it;
2551 ceph_bvec_iter_shorten(&obj_request->bvec_pos, length);
2552 ceph_bvec_iter_advance(&bvec_it, length);
Alex Elderf1a47392013-04-19 15:34:50 -05002553 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06002554
Alex Elder21692382013-04-05 01:27:12 -05002555 obj_request->callback = rbd_img_obj_callback;
Alex Elder7da22d22013-01-24 16:13:36 -06002556 obj_request->img_offset = img_offset;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002557
Alex Elder7da22d22013-01-24 16:13:36 -06002558 img_offset += length;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002559 resid -= length;
2560 }
2561
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002562 return __rbd_img_fill_request(img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002563
Alex Elderbf0d5f502012-11-22 00:00:08 -06002564out_unwind:
2565 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
Ilya Dryomov42dd0372014-03-04 11:57:17 +02002566 rbd_img_obj_request_del(img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002567
2568 return -ENOMEM;
2569}
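/*
 * Illustrative sketch, not driver code: how the loop above splits an
 * image extent across objects.  For a plain (non-striped) image,
 * rbd_segment_offset() and rbd_segment_length() reduce to the masking
 * shown here; a 4 MiB object size (obj_order == 22) is assumed.
 *
 *	u64 obj_order = 22, obj_size = 1ULL << obj_order;
 *	u64 img_offset = 6ULL << 20;		// 6 MiB into the image
 *	u64 resid = 3ULL << 20;			// 3 MiB request
 *
 *	while (resid) {
 *		u64 object_no = img_offset >> obj_order;    // 1, then 2
 *		u64 offset = img_offset & (obj_size - 1);   // 2 MiB, then 0
 *		u64 length = min(obj_size - offset, resid); // 2 MiB, then 1 MiB
 *
 *		img_offset += length;
 *		resid -= length;
 *	}
 *
 * A 3 MiB request at image offset 6 MiB thus becomes two object
 * requests: 2 MiB at offset 2 MiB in object 1, and 1 MiB at offset 0
 * in object 2.
 */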
2570
Alex Elder3d7efd12013-04-19 15:34:50 -05002571static void
Ilya Dryomov27617132015-07-16 17:36:11 +03002572rbd_osd_copyup_callback(struct rbd_obj_request *obj_request)
Alex Elder0eefd472013-04-19 15:34:50 -05002573{
2574 struct rbd_img_request *img_request;
2575 struct rbd_device *rbd_dev;
Alex Elder0eefd472013-04-19 15:34:50 -05002576
Ilya Dryomov27617132015-07-16 17:36:11 +03002577 dout("%s: obj %p\n", __func__, obj_request);
2578
Josh Durgind3246fb2014-04-07 16:49:21 -07002579 rbd_assert(obj_request->type == OBJ_REQUEST_BIO ||
2580 obj_request->type == OBJ_REQUEST_NODATA);
Alex Elder0eefd472013-04-19 15:34:50 -05002581 rbd_assert(obj_request_img_data_test(obj_request));
2582 img_request = obj_request->img_request;
2583 rbd_assert(img_request);
2584
2585 rbd_dev = img_request->rbd_dev;
2586 rbd_assert(rbd_dev);
Alex Elder0eefd472013-04-19 15:34:50 -05002587
Alex Elder0eefd472013-04-19 15:34:50 -05002588 /*
2589 * We want the transfer count to reflect the size of the
2590 * original write request. There is no such thing as a
2591 * successful short write, so if the request was successful
2592 * we can just set it to the originally-requested length.
2593 */
2594 if (!obj_request->result)
2595 obj_request->xferred = obj_request->length;
2596
Ilya Dryomov27617132015-07-16 17:36:11 +03002597 obj_request_done_set(obj_request);
Alex Elder0eefd472013-04-19 15:34:50 -05002598}
2599
2600static void
Alex Elder3d7efd12013-04-19 15:34:50 -05002601rbd_img_obj_parent_read_full_callback(struct rbd_img_request *img_request)
2602{
2603 struct rbd_obj_request *orig_request;
Alex Elder0eefd472013-04-19 15:34:50 -05002604 struct ceph_osd_request *osd_req;
Alex Elder0eefd472013-04-19 15:34:50 -05002605 struct rbd_device *rbd_dev;
Josh Durgind3246fb2014-04-07 16:49:21 -07002606 enum obj_operation_type op_type;
Alex Elderbbea1c12013-05-06 17:40:33 -05002607 int img_result;
Alex Elderebda6402013-05-10 16:29:22 -05002608 u64 parent_length;
Alex Elder3d7efd12013-04-19 15:34:50 -05002609
2610 rbd_assert(img_request_child_test(img_request));
2611
2612 /* First get what we need from the image request */
2613
Alex Elder3d7efd12013-04-19 15:34:50 -05002614 orig_request = img_request->obj_request;
2615 rbd_assert(orig_request != NULL);
Alex Elderb91f09f2013-05-10 16:29:22 -05002616 rbd_assert(obj_request_type_valid(orig_request->type));
Alex Elderbbea1c12013-05-06 17:40:33 -05002617 img_result = img_request->result;
Alex Elderebda6402013-05-10 16:29:22 -05002618 parent_length = img_request->length;
Ilya Dryomovfa355112016-09-16 15:20:42 +02002619 rbd_assert(img_result || parent_length == img_request->xferred);
Alex Elder3d7efd12013-04-19 15:34:50 -05002620 rbd_img_request_put(img_request);
2621
Alex Elder91c6feb2013-05-06 17:40:32 -05002622 rbd_assert(orig_request->img_request);
2623 rbd_dev = orig_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002624 rbd_assert(rbd_dev);
Alex Elder3d7efd12013-04-19 15:34:50 -05002625
Alex Elderbbea1c12013-05-06 17:40:33 -05002626 /*
2627 * If the overlap has become 0 (most likely because the
 2628 * image has been flattened) we just need to re-submit
 2629 * the original write request.
2630 */
2631 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002632 rbd_obj_request_submit(orig_request);
2633 return;
Alex Elderbbea1c12013-05-06 17:40:33 -05002634 }
2635
2636 if (img_result)
Alex Elder0eefd472013-04-19 15:34:50 -05002637 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002638
Alex Elder8785b1d2013-05-09 10:08:49 -05002639 /*
 2640 * The original osd request is of no use to us any more.
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002641 * We need a new one that can hold the three ops in a copyup
Alex Elder8785b1d2013-05-09 10:08:49 -05002642 * request. Allocate the new copyup osd request for the
2643 * original request, and release the old one.
2644 */
Alex Elderbbea1c12013-05-06 17:40:33 -05002645 img_result = -ENOMEM;
Alex Elder0eefd472013-04-19 15:34:50 -05002646 osd_req = rbd_osd_req_create_copyup(orig_request);
2647 if (!osd_req)
2648 goto out_err;
Alex Elder8785b1d2013-05-09 10:08:49 -05002649 rbd_osd_req_destroy(orig_request->osd_req);
Alex Elder0eefd472013-04-19 15:34:50 -05002650 orig_request->osd_req = osd_req;
Alex Elder3d7efd12013-04-19 15:34:50 -05002651
Alex Elder0eefd472013-04-19 15:34:50 -05002652 /* Initialize the copyup op */
2653
2654 osd_req_op_cls_init(osd_req, 0, CEPH_OSD_OP_CALL, "rbd", "copyup");
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002655 osd_req_op_cls_request_data_bvecs(osd_req, 0, orig_request->copyup_bvecs,
2656 parent_length);
Alex Elder0eefd472013-04-19 15:34:50 -05002657
Josh Durgind3246fb2014-04-07 16:49:21 -07002658 /* Add the other op(s) */
Ilya Dryomov0ccd5922014-02-25 16:22:28 +02002659
Josh Durgind3246fb2014-04-07 16:49:21 -07002660 op_type = rbd_img_request_op_type(orig_request->img_request);
2661 rbd_img_obj_request_fill(orig_request, osd_req, op_type, 1);
Alex Elder0eefd472013-04-19 15:34:50 -05002662
2663 /* All set, send it off. */
2664
Ilya Dryomov980917f2016-09-12 18:59:42 +02002665 rbd_obj_request_submit(orig_request);
2666 return;
Alex Elder0eefd472013-04-19 15:34:50 -05002667
Alex Elder0eefd472013-04-19 15:34:50 -05002668out_err:
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002669 rbd_obj_request_error(orig_request, img_result);
Alex Elder3d7efd12013-04-19 15:34:50 -05002670}
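/*
 * For reference, the op layout of the copyup request assembled above
 * (an assumption drawn from the calls, not a definitive OSD contract):
 *
 *	op 0:  CEPH_OSD_OP_CALL "rbd" "copyup", request data = the
 *	       parent_length bytes read from the parent image
 *	op 1+: the original write/discard op(s), re-filled by
 *	       rbd_img_obj_request_fill() starting at index 1
 *
 * The copyup populates the still-missing child object with parent
 * data and the guarded op(s) are then applied on top of it, all in
 * a single OSD request.
 */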
2671
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002672static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap);
2673
Alex Elder3d7efd12013-04-19 15:34:50 -05002674/*
2675 * Read from the parent image the range of data that covers the
2676 * entire target of the given object request. This is used for
2677 * satisfying a layered image write request when the target of an
2678 * object request from the image request does not exist.
2679 *
 2680 * A bio_vec array big enough to hold the returned data is
 2681 * allocated and supplied to rbd_img_request_fill() as the "data
 2682 * descriptor."  When the read completes, these bio_vecs will be
 2683 * handed to the original object request for the copyup operation.
2684 *
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002685 * If an error occurs, it is recorded as the result of the original
2686 * object request in rbd_img_obj_exists_callback().
Alex Elder3d7efd12013-04-19 15:34:50 -05002687 */
2688static int rbd_img_obj_parent_read_full(struct rbd_obj_request *obj_request)
2689{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002690 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elder3d7efd12013-04-19 15:34:50 -05002691 struct rbd_img_request *parent_request = NULL;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002692 struct ceph_bvec_iter bvec_it = { 0 };
Alex Elder3d7efd12013-04-19 15:34:50 -05002693 u64 img_offset;
2694 u64 length;
Alex Elder3d7efd12013-04-19 15:34:50 -05002695 int result;
2696
Alex Elder3d7efd12013-04-19 15:34:50 -05002697 rbd_assert(rbd_dev->parent != NULL);
2698
2699 /*
2700 * Determine the byte range covered by the object in the
2701 * child image to which the original request was to be sent.
2702 */
2703 img_offset = obj_request->img_offset - obj_request->offset;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01002704 length = rbd_obj_bytes(&rbd_dev->header);
Alex Elder3d7efd12013-04-19 15:34:50 -05002705
2706 /*
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05002707 * There is no defined parent data beyond the parent
2708 * overlap, so limit what we read at that boundary if
2709 * necessary.
2710 */
2711 if (img_offset + length > rbd_dev->parent_overlap) {
2712 rbd_assert(img_offset < rbd_dev->parent_overlap);
2713 length = rbd_dev->parent_overlap - img_offset;
2714 }
2715
2716 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002717 * Allocate a page array big enough to receive the data read
2718 * from the parent.
2719 */
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002720 result = setup_copyup_bvecs(obj_request, length);
2721 if (result)
Alex Elder3d7efd12013-04-19 15:34:50 -05002722 goto out_err;
Ilya Dryomovf9dcbc42018-01-20 10:30:11 +01002723
Alex Elder3d7efd12013-04-19 15:34:50 -05002724 result = -ENOMEM;
Alex Eldere93f3152013-05-08 22:50:04 -05002725 parent_request = rbd_parent_request_create(obj_request,
2726 img_offset, length);
Alex Elder3d7efd12013-04-19 15:34:50 -05002727 if (!parent_request)
2728 goto out_err;
Alex Elder3d7efd12013-04-19 15:34:50 -05002729
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002730 bvec_it.bvecs = obj_request->copyup_bvecs;
2731 bvec_it.iter.bi_size = length;
2732 result = rbd_img_request_fill(parent_request, OBJ_REQUEST_BVECS,
2733 &bvec_it);
Alex Elder3d7efd12013-04-19 15:34:50 -05002734 if (result)
2735 goto out_err;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002736
Alex Elder3d7efd12013-04-19 15:34:50 -05002737 parent_request->callback = rbd_img_obj_parent_read_full_callback;
Ilya Dryomov058aa992016-09-12 14:44:45 +02002738
Alex Elder3d7efd12013-04-19 15:34:50 -05002739 result = rbd_img_request_submit(parent_request);
2740 if (!result)
2741 return 0;
2742
Alex Elder3d7efd12013-04-19 15:34:50 -05002743out_err:
Alex Elder3d7efd12013-04-19 15:34:50 -05002744 if (parent_request)
2745 rbd_img_request_put(parent_request);
Alex Elder3d7efd12013-04-19 15:34:50 -05002746 return result;
2747}
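/*
 * Worked example for the overlap clamp above, with made-up numbers:
 * 4 MiB objects and parent_overlap == 10 MiB.  A write to object 2
 * gives img_offset == 8 MiB and an initial length of 4 MiB; since
 * 8 MiB + 4 MiB extends past the 10 MiB overlap, the parent read is
 * shortened to 2 MiB.  Parent data beyond the overlap is undefined
 * and must not be copied up.
 */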
2748
Alex Elderc5b5ef62013-02-11 12:33:24 -06002749static void rbd_img_obj_exists_callback(struct rbd_obj_request *obj_request)
2750{
Alex Elderc5b5ef62013-02-11 12:33:24 -06002751 struct rbd_obj_request *orig_request;
Alex Elder638f5ab2013-05-06 17:40:33 -05002752 struct rbd_device *rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002753 int result;
2754
2755 rbd_assert(!obj_request_img_data_test(obj_request));
2756
2757 /*
2758 * All we need from the object request is the original
2759 * request and the result of the STAT op. Grab those, then
2760 * we're done with the request.
2761 */
2762 orig_request = obj_request->obj_request;
2763 obj_request->obj_request = NULL;
Alex Elder912c3172013-05-13 20:35:38 -05002764 rbd_obj_request_put(orig_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002765 rbd_assert(orig_request);
2766 rbd_assert(orig_request->img_request);
2767
2768 result = obj_request->result;
2769 obj_request->result = 0;
2770
2771 dout("%s: obj %p for obj %p result %d %llu/%llu\n", __func__,
2772 obj_request, orig_request, result,
2773 obj_request->xferred, obj_request->length);
2774 rbd_obj_request_put(obj_request);
2775
Alex Elder638f5ab2013-05-06 17:40:33 -05002776 /*
2777 * If the overlap has become 0 (most likely because the
Ilya Dryomov980917f2016-09-12 18:59:42 +02002778 * image has been flattened) we need to re-submit the
2779 * original request.
Alex Elder638f5ab2013-05-06 17:40:33 -05002780 */
2781 rbd_dev = orig_request->img_request->rbd_dev;
2782 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002783 rbd_obj_request_submit(orig_request);
2784 return;
Alex Elder638f5ab2013-05-06 17:40:33 -05002785 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002786
2787 /*
2788 * Our only purpose here is to determine whether the object
2789 * exists, and we don't want to treat the non-existence as
2790 * an error. If something else comes back, transfer the
2791 * error to the original request and complete it now.
2792 */
2793 if (!result) {
2794 obj_request_existence_set(orig_request, true);
2795 } else if (result == -ENOENT) {
2796 obj_request_existence_set(orig_request, false);
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002797 } else {
2798 goto fail_orig_request;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002799 }
2800
2801 /*
2802 * Resubmit the original request now that we have recorded
2803 * whether the target object exists.
2804 */
Ilya Dryomovc2e82412016-09-13 20:18:01 +02002805 result = rbd_img_obj_request_submit(orig_request);
2806 if (result)
2807 goto fail_orig_request;
2808
2809 return;
2810
2811fail_orig_request:
Ilya Dryomov0dcc6852016-09-26 15:43:52 +02002812 rbd_obj_request_error(orig_request, result);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002813}
2814
2815static int rbd_img_obj_exists_submit(struct rbd_obj_request *obj_request)
2816{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002817 struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002818 struct rbd_obj_request *stat_request;
Ilya Dryomov710214e2016-09-15 17:53:32 +02002819 struct page **pages;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002820 int ret;
2821
Ilya Dryomov06fbb692018-01-20 10:30:10 +01002822 stat_request = rbd_obj_request_create(OBJ_REQUEST_NODATA);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002823 if (!stat_request)
2824 return -ENOMEM;
2825
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01002826 stat_request->object_no = obj_request->object_no;
2827
Ilya Dryomov710214e2016-09-15 17:53:32 +02002828 stat_request->osd_req = rbd_osd_req_create(rbd_dev, OBJ_OP_READ, 1,
2829 stat_request);
2830 if (!stat_request->osd_req) {
2831 ret = -ENOMEM;
2832 goto fail_stat_request;
2833 }
2834
Alex Elderc5b5ef62013-02-11 12:33:24 -06002835 /*
2836 * The response data for a STAT call consists of:
2837 * le64 length;
2838 * struct {
2839 * le32 tv_sec;
2840 * le32 tv_nsec;
2841 * } mtime;
2842 */
Ilya Dryomov06fbb692018-01-20 10:30:10 +01002843 pages = ceph_alloc_page_vector(1, GFP_NOIO);
Ilya Dryomov710214e2016-09-15 17:53:32 +02002844 if (IS_ERR(pages)) {
2845 ret = PTR_ERR(pages);
2846 goto fail_stat_request;
2847 }
Alex Elderc5b5ef62013-02-11 12:33:24 -06002848
Ilya Dryomov710214e2016-09-15 17:53:32 +02002849 osd_req_op_init(stat_request->osd_req, 0, CEPH_OSD_OP_STAT, 0);
Ilya Dryomov06fbb692018-01-20 10:30:10 +01002850 osd_req_op_raw_data_in_pages(stat_request->osd_req, 0, pages,
2851 8 + sizeof(struct ceph_timespec),
2852 0, false, true);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002853
2854 rbd_obj_request_get(obj_request);
2855 stat_request->obj_request = obj_request;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002856 stat_request->callback = rbd_img_obj_exists_callback;
2857
Ilya Dryomov980917f2016-09-12 18:59:42 +02002858 rbd_obj_request_submit(stat_request);
2859 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06002860
Ilya Dryomov710214e2016-09-15 17:53:32 +02002861fail_stat_request:
2862 rbd_obj_request_put(stat_request);
Alex Elderc5b5ef62013-02-11 12:33:24 -06002863 return ret;
2864}
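/*
 * A minimal decoding sketch for the STAT reply described above; the
 * driver itself never decodes it and only looks at the op result
 * (0 vs -ENOENT):
 *
 *	void *p = page_address(pages[0]);
 *	u64 size = le64_to_cpu(*(__le64 *)p);
 *	struct ceph_timespec *mtime = p + sizeof(__le64);
 *	u32 secs = le32_to_cpu(mtime->tv_sec);
 */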
2865
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002866static bool img_obj_request_simple(struct rbd_obj_request *obj_request)
Alex Elderb454e362013-04-19 15:34:50 -05002867{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002868 struct rbd_img_request *img_request = obj_request->img_request;
2869 struct rbd_device *rbd_dev = img_request->rbd_dev;
Alex Elderb454e362013-04-19 15:34:50 -05002870
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002871 /* Reads */
Josh Durgin1c220882014-04-04 17:49:12 -07002872 if (!img_request_write_test(img_request) &&
2873 !img_request_discard_test(img_request))
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002874 return true;
Alex Elderb454e362013-04-19 15:34:50 -05002875
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002876 /* Non-layered writes */
2877 if (!img_request_layered_test(img_request))
2878 return true;
2879
2880 /*
2881 * Layered writes outside of the parent overlap range don't
2882 * share any data with the parent.
2883 */
2884 if (!obj_request_overlaps_parent(obj_request))
2885 return true;
2886
2887 /*
Guangliang Zhaoc622d222014-04-01 22:22:15 +08002888 * Entire-object layered writes - we will overwrite whatever
2889 * parent data there is anyway.
2890 */
2891 if (!obj_request->offset &&
2892 obj_request->length == rbd_obj_bytes(&rbd_dev->header))
2893 return true;
2894
2895 /*
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002896 * If the object is known to already exist, its parent data has
2897 * already been copied.
2898 */
2899 if (obj_request_known_test(obj_request) &&
2900 obj_request_exists_test(obj_request))
2901 return true;
2902
2903 return false;
2904}
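/*
 * Summary of the tests above -- an object request is "simple" (no
 * copyup machinery needed) when any of these hold:
 *
 *	read                                -> simple
 *	write/discard, image not layered    -> simple
 *	write/discard beyond parent overlap -> simple
 *	write/discard of the entire object  -> simple (parent clobbered)
 *	target object known to exist        -> simple (copyup done before)
 *	otherwise                           -> existence check + copyup path
 */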
2905
2906static int rbd_img_obj_request_submit(struct rbd_obj_request *obj_request)
2907{
Ilya Dryomov058aa992016-09-12 14:44:45 +02002908 rbd_assert(obj_request_img_data_test(obj_request));
2909 rbd_assert(obj_request_type_valid(obj_request->type));
2910 rbd_assert(obj_request->img_request);
2911
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002912 if (img_obj_request_simple(obj_request)) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02002913 rbd_obj_request_submit(obj_request);
2914 return 0;
Alex Elderb454e362013-04-19 15:34:50 -05002915 }
2916
2917 /*
Alex Elder3d7efd12013-04-19 15:34:50 -05002918 * It's a layered write. The target object might exist but
2919 * we may not know that yet. If we know it doesn't exist,
2920 * start by reading the data for the full target object from
2921 * the parent so we can use it for a copyup to the target.
Alex Elderb454e362013-04-19 15:34:50 -05002922 */
Ilya Dryomov70d045f2014-09-12 16:02:01 +04002923 if (obj_request_known_test(obj_request))
Alex Elder3d7efd12013-04-19 15:34:50 -05002924 return rbd_img_obj_parent_read_full(obj_request);
2925
2926 /* We don't know whether the target exists. Go find out. */
Alex Elderb454e362013-04-19 15:34:50 -05002927
2928 return rbd_img_obj_exists_submit(obj_request);
2929}
2930
Alex Elderbf0d5f502012-11-22 00:00:08 -06002931static int rbd_img_request_submit(struct rbd_img_request *img_request)
2932{
Alex Elderbf0d5f502012-11-22 00:00:08 -06002933 struct rbd_obj_request *obj_request;
Alex Elder46faeed2013-04-10 17:47:46 -05002934 struct rbd_obj_request *next_obj_request;
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002935 int ret = 0;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002936
Alex Elder37206ee2013-02-20 17:32:08 -06002937 dout("%s: img %p\n", __func__, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002938
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002939 rbd_img_request_get(img_request);
2940 for_each_obj_request_safe(img_request, obj_request, next_obj_request) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002941 rbd_obj_request_submit(obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002942 }
2943
Ilya Dryomov663ae2c2016-05-16 13:18:57 +02002944 rbd_img_request_put(img_request);
2945 return ret;
Alex Elderbf0d5f502012-11-22 00:00:08 -06002946}
2947
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002948static void rbd_img_end_child_request(struct rbd_img_request *img_req);
2949
2950static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req,
2951 u64 img_offset, u32 bytes)
2952{
2953 struct rbd_img_request *img_req = obj_req->img_request;
2954 struct rbd_img_request *child_img_req;
2955 int ret;
2956
2957 child_img_req = rbd_parent_request_create(obj_req, img_offset, bytes);
2958 if (!child_img_req)
2959 return -ENOMEM;
2960
2961 child_img_req->callback = rbd_img_end_child_request;
2962
2963 if (!rbd_img_is_write(img_req)) {
2964 switch (obj_req->type) {
2965 case OBJ_REQUEST_BIO:
2966 ret = rbd_img_request_fill(child_img_req,
2967 OBJ_REQUEST_BIO,
2968 &obj_req->bio_pos);
2969 break;
2970 case OBJ_REQUEST_BVECS:
2971 ret = rbd_img_request_fill(child_img_req,
2972 OBJ_REQUEST_BVECS,
2973 &obj_req->bvec_pos);
2974 break;
2975 default:
2976 rbd_assert(0);
2977 }
2978 } else {
2979 struct ceph_bvec_iter it = {
2980 .bvecs = obj_req->copyup_bvecs,
2981 .iter = { .bi_size = bytes },
2982 };
2983
2984 ret = rbd_img_request_fill(child_img_req, OBJ_REQUEST_BVECS,
2985 &it);
2986 }
2987 if (ret) {
2988 rbd_img_request_put(child_img_req);
2989 return ret;
2990 }
2991
2992 rbd_img_request_submit(child_img_req);
2993 return 0;
2994}
2995
2996static bool rbd_obj_handle_read(struct rbd_obj_request *obj_req)
2997{
2998 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2999 int ret;
3000
3001 if (obj_req->result == -ENOENT &&
3002 obj_req->img_offset < rbd_dev->parent_overlap &&
3003 !obj_req->tried_parent) {
3004 u64 obj_overlap = min(obj_req->length,
3005 rbd_dev->parent_overlap - obj_req->img_offset);
3006
3007 obj_req->tried_parent = true;
3008 ret = rbd_obj_read_from_parent(obj_req, obj_req->img_offset,
3009 obj_overlap);
3010 if (ret) {
3011 obj_req->result = ret;
3012 return true;
3013 }
3014 return false;
3015 }
3016
3017 /*
3018 * -ENOENT means a hole in the image -- zero-fill the entire
3019 * length of the request. A short read also implies zero-fill
3020 * to the end of the request. In both cases we update xferred
3021 * count to indicate the whole request was satisfied.
3022 */
3023 if (obj_req->result == -ENOENT ||
3024 (!obj_req->result && obj_req->xferred < obj_req->length)) {
3025 rbd_assert(!obj_req->xferred || !obj_req->result);
3026 rbd_obj_zero_range(obj_req, obj_req->xferred,
3027 obj_req->length - obj_req->xferred);
3028 obj_req->result = 0;
3029 obj_req->xferred = obj_req->length;
3030 }
3031
3032 return true;
3033}
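/*
 * Zero-fill example for the read completion above (illustrative
 * numbers): a 1 MiB object read that returns only 256 KiB ends with
 *
 *	rbd_obj_zero_range(obj_req, 256 << 10, (1024 - 256) << 10);
 *	obj_req->xferred = obj_req->length;	// report a full 1 MiB
 *
 * -ENOENT (a hole, object missing entirely) is the degenerate case
 * with xferred == 0, so the whole request range is zero-filled.
 */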
3034
3035/*
3036 * copyup_bvecs pages are never highmem pages
3037 */
3038static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
3039{
3040 struct ceph_bvec_iter it = {
3041 .bvecs = bvecs,
3042 .iter = { .bi_size = bytes },
3043 };
3044
3045 ceph_bvec_iter_advance_step(&it, bytes, ({
3046 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
3047 bv.bv_len))
3048 return false;
3049 }));
3050 return true;
3051}
3052
3053static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
3054{
3055 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3056 unsigned int num_osd_ops = obj_req->osd_req->r_num_ops;
3057
3058 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
3059 rbd_assert(obj_req->osd_req->r_ops[0].op == CEPH_OSD_OP_STAT);
3060 rbd_osd_req_destroy(obj_req->osd_req);
3061
3062 /*
3063 * Create a copyup request with the same number of OSD ops as
3064 * the original request. The original request was stat + op(s),
3065 * the new copyup request will be copyup + the same op(s).
3066 */
3067 obj_req->osd_req = rbd_osd_req_create(rbd_dev,
3068 rbd_img_request_op_type(obj_req->img_request),
3069 num_osd_ops, obj_req);
3070 if (!obj_req->osd_req)
3071 return -ENOMEM;
3072
3073 /*
3074 * Only send non-zero copyup data to save some I/O and network
3075 * bandwidth -- zero copyup data is equivalent to the object not
3076 * existing.
3077 */
3078 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
3079 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
3080 bytes = 0;
3081 }
3082
3083 osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
3084 "copyup");
3085 osd_req_op_cls_request_data_bvecs(obj_req->osd_req, 0,
3086 obj_req->copyup_bvecs, bytes);
3087
3088 switch (rbd_img_request_op_type(obj_req->img_request)) {
3089 case OBJ_OP_WRITE:
3090 __rbd_obj_setup_write(obj_req, 1);
3091 break;
3092 case OBJ_OP_DISCARD:
3093 rbd_assert(!rbd_obj_is_entire(obj_req));
3094 __rbd_obj_setup_discard(obj_req, 1);
3095 break;
3096 default:
3097 rbd_assert(0);
3098 }
3099
3100 rbd_obj_request_submit(obj_req);
3101 /* FIXME: in lieu of rbd_img_obj_callback() */
3102 rbd_img_request_put(obj_req->img_request);
3103 return 0;
3104}
3105
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01003106static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
3107{
3108 u32 i;
3109
3110 rbd_assert(!obj_req->copyup_bvecs);
3111 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
3112 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
3113 sizeof(*obj_req->copyup_bvecs),
3114 GFP_NOIO);
3115 if (!obj_req->copyup_bvecs)
3116 return -ENOMEM;
3117
3118 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
3119 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
3120
3121 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
3122 if (!obj_req->copyup_bvecs[i].bv_page)
3123 return -ENOMEM;
3124
3125 obj_req->copyup_bvecs[i].bv_offset = 0;
3126 obj_req->copyup_bvecs[i].bv_len = len;
3127 obj_overlap -= len;
3128 }
3129
3130 rbd_assert(!obj_overlap);
3131 return 0;
3132}
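/*
 * Resulting bvec layout, illustrated for an object overlap of 9 KiB
 * with 4 KiB pages: calc_pages_for(0, 9K) yields three bvecs covering
 * 4K + 4K + 1K.  Each bvec owns a whole page at bv_offset 0; only the
 * last one has a short bv_len.
 */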
3133
Alex Elder8b3e1a52013-01-24 16:13:36 -06003134static void rbd_img_parent_read_callback(struct rbd_img_request *img_request)
3135{
3136 struct rbd_obj_request *obj_request;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003137 struct rbd_device *rbd_dev;
3138 u64 obj_end;
Alex Elder02c74fb2013-05-06 17:40:33 -05003139 u64 img_xferred;
3140 int img_result;
Alex Elder8b3e1a52013-01-24 16:13:36 -06003141
3142 rbd_assert(img_request_child_test(img_request));
3143
Alex Elder02c74fb2013-05-06 17:40:33 -05003144 /* First get what we need from the image request and release it */
3145
Alex Elder8b3e1a52013-01-24 16:13:36 -06003146 obj_request = img_request->obj_request;
Alex Elder02c74fb2013-05-06 17:40:33 -05003147 img_xferred = img_request->xferred;
3148 img_result = img_request->result;
3149 rbd_img_request_put(img_request);
3150
3151 /*
3152 * If the overlap has become 0 (most likely because the
3153 * image has been flattened) we need to re-submit the
3154 * original request.
3155 */
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003156 rbd_assert(obj_request);
3157 rbd_assert(obj_request->img_request);
Alex Elder02c74fb2013-05-06 17:40:33 -05003158 rbd_dev = obj_request->img_request->rbd_dev;
3159 if (!rbd_dev->parent_overlap) {
Ilya Dryomov980917f2016-09-12 18:59:42 +02003160 rbd_obj_request_submit(obj_request);
3161 return;
Alex Elder02c74fb2013-05-06 17:40:33 -05003162 }
3163
3164 obj_request->result = img_result;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003165 if (obj_request->result)
3166 goto out;
3167
3168 /*
3169 * We need to zero anything beyond the parent overlap
3170 * boundary. Since rbd_img_obj_request_read_callback()
3171 * will zero anything beyond the end of a short read, an
3172 * easy way to do this is to pretend the data from the
3173 * parent came up short--ending at the overlap boundary.
3174 */
3175 rbd_assert(obj_request->img_offset < U64_MAX - obj_request->length);
3176 obj_end = obj_request->img_offset + obj_request->length;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003177 if (obj_end > rbd_dev->parent_overlap) {
3178 u64 xferred = 0;
3179
3180 if (obj_request->img_offset < rbd_dev->parent_overlap)
3181 xferred = rbd_dev->parent_overlap -
3182 obj_request->img_offset;
3183
Alex Elder02c74fb2013-05-06 17:40:33 -05003184 obj_request->xferred = min(img_xferred, xferred);
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003185 } else {
Alex Elder02c74fb2013-05-06 17:40:33 -05003186 obj_request->xferred = img_xferred;
Alex Eldera9e8ba2c2013-04-21 00:32:07 -05003187 }
3188out:
Alex Elder8b3e1a52013-01-24 16:13:36 -06003189 rbd_img_obj_request_read_callback(obj_request);
3190 rbd_obj_request_complete(obj_request);
3191}
3192
3193static void rbd_img_parent_read(struct rbd_obj_request *obj_request)
3194{
Alex Elder8b3e1a52013-01-24 16:13:36 -06003195 struct rbd_img_request *img_request;
3196 int result;
3197
3198 rbd_assert(obj_request_img_data_test(obj_request));
3199 rbd_assert(obj_request->img_request != NULL);
3200 rbd_assert(obj_request->result == (s32) -ENOENT);
Alex Elder5b2ab722013-05-06 17:40:33 -05003201 rbd_assert(obj_request_type_valid(obj_request->type));
Alex Elder8b3e1a52013-01-24 16:13:36 -06003202
Alex Elder8b3e1a52013-01-24 16:13:36 -06003203 /* rbd_read_finish(obj_request, obj_request->length); */
Alex Eldere93f3152013-05-08 22:50:04 -05003204 img_request = rbd_parent_request_create(obj_request,
Alex Elder8b3e1a52013-01-24 16:13:36 -06003205 obj_request->img_offset,
Alex Eldere93f3152013-05-08 22:50:04 -05003206 obj_request->length);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003207 result = -ENOMEM;
3208 if (!img_request)
3209 goto out_err;
3210
Alex Elder5b2ab722013-05-06 17:40:33 -05003211 if (obj_request->type == OBJ_REQUEST_BIO)
3212 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
Ilya Dryomov5359a172018-01-20 10:30:10 +01003213 &obj_request->bio_pos);
Alex Elder5b2ab722013-05-06 17:40:33 -05003214 else
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01003215 result = rbd_img_request_fill(img_request, OBJ_REQUEST_BVECS,
3216 &obj_request->bvec_pos);
Alex Elder8b3e1a52013-01-24 16:13:36 -06003217 if (result)
3218 goto out_err;
3219
3220 img_request->callback = rbd_img_parent_read_callback;
3221 result = rbd_img_request_submit(img_request);
3222 if (result)
3223 goto out_err;
3224
3225 return;
3226out_err:
3227 if (img_request)
3228 rbd_img_request_put(img_request);
3229 obj_request->result = result;
3230 obj_request->xferred = 0;
3231 obj_request_done_set(obj_request);
3232}
3233
Ilya Dryomov3da691b2018-01-29 14:04:08 +01003234static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
3235{
3236 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
3237 u64 img_offset;
3238 u64 obj_overlap;
3239 int ret;
3240
3241 if (!obj_request_overlaps_parent(obj_req)) {
3242 /*
3243 * The overlap has become 0 (most likely because the
3244 * image has been flattened). Use rbd_obj_issue_copyup()
3245 * to re-submit the original write request -- the copyup
3246 * operation itself will be a no-op, since someone must
3247 * have populated the child object while we weren't
3248 * looking. Move to WRITE_FLAT state as we'll be done
3249 * with the operation once the null copyup completes.
3250 */
3251 obj_req->write_state = RBD_OBJ_WRITE_FLAT;
3252 return rbd_obj_issue_copyup(obj_req, 0);
3253 }
3254
3255 /*
3256 * Determine the byte range covered by the object in the
3257 * child image to which the original request was to be sent.
3258 */
3259 img_offset = obj_req->img_offset - obj_req->offset;
3260 obj_overlap = rbd_dev->layout.object_size;
3261
3262 /*
3263 * There is no defined parent data beyond the parent
3264 * overlap, so limit what we read at that boundary if
3265 * necessary.
3266 */
3267 if (img_offset + obj_overlap > rbd_dev->parent_overlap) {
3268 rbd_assert(img_offset < rbd_dev->parent_overlap);
3269 obj_overlap = rbd_dev->parent_overlap - img_offset;
3270 }
3271
3272 ret = setup_copyup_bvecs(obj_req, obj_overlap);
3273 if (ret)
3274 return ret;
3275
3276 obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
3277 return rbd_obj_read_from_parent(obj_req, img_offset, obj_overlap);
3278}
3279
3280static bool rbd_obj_handle_write(struct rbd_obj_request *obj_req)
3281{
3282 int ret;
3283
3284again:
3285 switch (obj_req->write_state) {
3286 case RBD_OBJ_WRITE_GUARD:
3287 rbd_assert(!obj_req->xferred);
3288 if (obj_req->result == -ENOENT) {
3289 /*
3290 * The target object doesn't exist. Read the data for
3291 * the entire target object up to the overlap point (if
3292 * any) from the parent, so we can use it for a copyup.
3293 */
3294 ret = rbd_obj_handle_write_guard(obj_req);
3295 if (ret) {
3296 obj_req->result = ret;
3297 return true;
3298 }
3299 return false;
3300 }
3301 /* fall through */
3302 case RBD_OBJ_WRITE_FLAT:
3303 if (!obj_req->result)
3304 /*
3305 * There is no such thing as a successful short
3306 * write -- indicate the whole request was satisfied.
3307 */
3308 obj_req->xferred = obj_req->length;
3309 return true;
3310 case RBD_OBJ_WRITE_COPYUP:
3311 obj_req->write_state = RBD_OBJ_WRITE_GUARD;
3312 if (obj_req->result)
3313 goto again;
3314
3315 rbd_assert(obj_req->xferred);
3316 ret = rbd_obj_issue_copyup(obj_req, obj_req->xferred);
3317 if (ret) {
3318 obj_req->result = ret;
3319 return true;
3320 }
3321 return false;
3322 default:
3323 rbd_assert(0);
3324 }
3325}
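/*
 * A summary of the write state machine above (no new behavior):
 *
 *	GUARD, -ENOENT -> read parent data, state = COPYUP (or a null
 *	                  copyup with state = FLAT if the overlap is
 *	                  gone); return false, another round trip ahead
 *	COPYUP         -> parent data in hand, issue copyup + op(s),
 *	                  state = GUARD for the copyup completion
 *	GUARD, success -> fall through to FLAT
 *	FLAT           -> done; xferred = length on success
 */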
3326
3327/*
3328 * Returns true if @obj_req is completed, or false otherwise.
3329 */
3330static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req)
3331{
3332 switch (rbd_img_request_op_type(obj_req->img_request)) {
3333 case OBJ_OP_READ:
3334 return rbd_obj_handle_read(obj_req);
3335 case OBJ_OP_WRITE:
3336 return rbd_obj_handle_write(obj_req);
3337 case OBJ_OP_DISCARD:
3338 if (rbd_obj_handle_write(obj_req)) {
3339 /*
3340 * Hide -ENOENT from delete/truncate/zero -- discarding
3341 * a non-existent object is not a problem.
3342 */
3343 if (obj_req->result == -ENOENT) {
3344 obj_req->result = 0;
3345 obj_req->xferred = obj_req->length;
3346 }
3347 return true;
3348 }
3349 return false;
3350 default:
3351 rbd_assert(0);
3352 }
3353}
3354
3355static void rbd_img_end_child_request(struct rbd_img_request *img_req)
3356{
3357 struct rbd_obj_request *obj_req = img_req->obj_request;
3358
3359 rbd_assert(test_bit(IMG_REQ_CHILD, &img_req->flags));
3360
3361 obj_req->result = img_req->result;
3362 obj_req->xferred = img_req->xferred;
3363 rbd_img_request_put(img_req);
3364
3365 rbd_obj_handle_request(obj_req);
3366}
3367
3368static void rbd_obj_handle_request(struct rbd_obj_request *obj_req)
3369{
3370 if (!__rbd_obj_handle_request(obj_req))
3371 return;
3372
3373 obj_request_done_set(obj_req);
3374 rbd_obj_request_complete(obj_req);
3375}
3376
Ilya Dryomoved95b212016-08-12 16:40:02 +02003377static const struct rbd_client_id rbd_empty_cid;
3378
3379static bool rbd_cid_equal(const struct rbd_client_id *lhs,
3380 const struct rbd_client_id *rhs)
3381{
3382 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
3383}
3384
3385static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
3386{
3387 struct rbd_client_id cid;
3388
3389 mutex_lock(&rbd_dev->watch_mutex);
3390 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
3391 cid.handle = rbd_dev->watch_cookie;
3392 mutex_unlock(&rbd_dev->watch_mutex);
3393 return cid;
3394}
3395
3396/*
3397 * lock_rwsem must be held for write
3398 */
3399static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
3400 const struct rbd_client_id *cid)
3401{
3402 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
3403 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
3404 cid->gid, cid->handle);
3405 rbd_dev->owner_cid = *cid; /* struct */
3406}
3407
3408static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
3409{
3410 mutex_lock(&rbd_dev->watch_mutex);
3411 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
3412 mutex_unlock(&rbd_dev->watch_mutex);
3413}
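/*
 * Cookie usage sketch, assuming the "auto" RBD_LOCK_COOKIE_PREFIX
 * defined earlier in this file:
 *
 *	char cookie[32];
 *	format_lock_cookie(rbd_dev, cookie);
 *	// cookie == "auto 140978559" for watch_cookie == 140978559
 *
 * The cookie ties the exclusive lock to this client's watch;
 * find_watcher() parses it back with sscanf() to decide whether the
 * lock owner is still alive.
 */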
3414
Florian Margaineedd8ca82017-12-13 16:43:59 +01003415static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
3416{
3417 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3418
3419 strcpy(rbd_dev->lock_cookie, cookie);
3420 rbd_set_owner_cid(rbd_dev, &cid);
3421 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
3422}
3423
Ilya Dryomoved95b212016-08-12 16:40:02 +02003424/*
3425 * lock_rwsem must be held for write
3426 */
3427static int rbd_lock(struct rbd_device *rbd_dev)
3428{
3429 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003430 char cookie[32];
3431 int ret;
3432
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003433 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
3434 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003435
3436 format_lock_cookie(rbd_dev, cookie);
3437 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
3438 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
3439 RBD_LOCK_TAG, "", 0);
3440 if (ret)
3441 return ret;
3442
3443 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01003444 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003445 return 0;
3446}
3447
3448/*
3449 * lock_rwsem must be held for write
3450 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02003451static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003452{
3453 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003454 int ret;
3455
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003456 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
3457 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02003458
Ilya Dryomoved95b212016-08-12 16:40:02 +02003459 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003460 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02003461 if (ret && ret != -ENOENT)
3462 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003463
Ilya Dryomovbbead742017-04-13 12:17:38 +02003464 /* treat errors as the image is unlocked */
3465 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003466 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003467 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3468 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003469}
3470
3471static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3472 enum rbd_notify_op notify_op,
3473 struct page ***preply_pages,
3474 size_t *preply_len)
3475{
3476 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3477 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
3478 int buf_size = 4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN;
3479 char buf[buf_size];
3480 void *p = buf;
3481
3482 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3483
3484 /* encode *LockPayload NotifyMessage (op + ClientId) */
3485 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3486 ceph_encode_32(&p, notify_op);
3487 ceph_encode_64(&p, cid.gid);
3488 ceph_encode_64(&p, cid.handle);
3489
3490 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3491 &rbd_dev->header_oloc, buf, buf_size,
3492 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3493}
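/*
 * Wire format of the NotifyMessage built above, inferred from the
 * encoding calls (shown for a REQUEST_LOCK from client 4213, watch
 * handle 2; all fields little-endian):
 *
 *	u8  struct_v = 2;	u8 struct_compat = 1;
 *	u32 struct_len = 20;	// 4 + 8 + 8
 *	u32 notify_op = RBD_NOTIFY_OP_REQUEST_LOCK;
 *	u64 gid = 4213;
 *	u64 handle = 2;
 */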
3494
3495static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3496 enum rbd_notify_op notify_op)
3497{
3498 struct page **reply_pages;
3499 size_t reply_len;
3500
3501 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3502 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3503}
3504
3505static void rbd_notify_acquired_lock(struct work_struct *work)
3506{
3507 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3508 acquired_lock_work);
3509
3510 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3511}
3512
3513static void rbd_notify_released_lock(struct work_struct *work)
3514{
3515 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3516 released_lock_work);
3517
3518 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3519}
3520
3521static int rbd_request_lock(struct rbd_device *rbd_dev)
3522{
3523 struct page **reply_pages;
3524 size_t reply_len;
3525 bool lock_owner_responded = false;
3526 int ret;
3527
3528 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3529
3530 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3531 &reply_pages, &reply_len);
3532 if (ret && ret != -ETIMEDOUT) {
3533 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3534 goto out;
3535 }
3536
3537 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3538 void *p = page_address(reply_pages[0]);
3539 void *const end = p + reply_len;
3540 u32 n;
3541
3542 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3543 while (n--) {
3544 u8 struct_v;
3545 u32 len;
3546
3547 ceph_decode_need(&p, end, 8 + 8, e_inval);
3548 p += 8 + 8; /* skip gid and cookie */
3549
3550 ceph_decode_32_safe(&p, end, len, e_inval);
3551 if (!len)
3552 continue;
3553
3554 if (lock_owner_responded) {
3555 rbd_warn(rbd_dev,
3556 "duplicate lock owners detected");
3557 ret = -EIO;
3558 goto out;
3559 }
3560
3561 lock_owner_responded = true;
3562 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3563 &struct_v, &len);
3564 if (ret) {
3565 rbd_warn(rbd_dev,
3566 "failed to decode ResponseMessage: %d",
3567 ret);
3568 goto e_inval;
3569 }
3570
3571 ret = ceph_decode_32(&p);
3572 }
3573 }
3574
3575 if (!lock_owner_responded) {
3576 rbd_warn(rbd_dev, "no lock owners detected");
3577 ret = -ETIMEDOUT;
3578 }
3579
3580out:
3581 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3582 return ret;
3583
3584e_inval:
3585 ret = -EINVAL;
3586 goto out;
3587}
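/*
 * Reply layout walked by the loop above, inferred from the decode
 * calls:
 *
 *	u32 num_acks;
 *	struct {
 *		u64 gid;		// acker
 *		u64 cookie;		// skipped here
 *		u32 len;		// 0 for a bare ack
 *		ResponseMessage msg;	// encoded s32 result
 *	} acks[num_acks];
 *
 * Only the lock owner replies with a payload; a second payload means
 * duplicate owners and is treated as -EIO.
 */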
3588
3589static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3590{
3591 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3592
3593 cancel_delayed_work(&rbd_dev->lock_dwork);
3594 if (wake_all)
3595 wake_up_all(&rbd_dev->lock_waitq);
3596 else
3597 wake_up(&rbd_dev->lock_waitq);
3598}
3599
3600static int get_lock_owner_info(struct rbd_device *rbd_dev,
3601 struct ceph_locker **lockers, u32 *num_lockers)
3602{
3603 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3604 u8 lock_type;
3605 char *lock_tag;
3606 int ret;
3607
3608 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3609
3610 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3611 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3612 &lock_type, &lock_tag, lockers, num_lockers);
3613 if (ret)
3614 return ret;
3615
3616 if (*num_lockers == 0) {
3617 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3618 goto out;
3619 }
3620
3621 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3622 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3623 lock_tag);
3624 ret = -EBUSY;
3625 goto out;
3626 }
3627
3628 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3629 rbd_warn(rbd_dev, "shared lock type detected");
3630 ret = -EBUSY;
3631 goto out;
3632 }
3633
3634 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3635 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3636 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3637 (*lockers)[0].id.cookie);
3638 ret = -EBUSY;
3639 goto out;
3640 }
3641
3642out:
3643 kfree(lock_tag);
3644 return ret;
3645}
3646
3647static int find_watcher(struct rbd_device *rbd_dev,
3648 const struct ceph_locker *locker)
3649{
3650 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3651 struct ceph_watch_item *watchers;
3652 u32 num_watchers;
3653 u64 cookie;
3654 int i;
3655 int ret;
3656
3657 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3658 &rbd_dev->header_oloc, &watchers,
3659 &num_watchers);
3660 if (ret)
3661 return ret;
3662
3663 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3664 for (i = 0; i < num_watchers; i++) {
3665 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3666 sizeof(locker->info.addr)) &&
3667 watchers[i].cookie == cookie) {
3668 struct rbd_client_id cid = {
3669 .gid = le64_to_cpu(watchers[i].name.num),
3670 .handle = cookie,
3671 };
3672
3673 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3674 rbd_dev, cid.gid, cid.handle);
3675 rbd_set_owner_cid(rbd_dev, &cid);
3676 ret = 1;
3677 goto out;
3678 }
3679 }
3680
3681 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3682 ret = 0;
3683out:
3684 kfree(watchers);
3685 return ret;
3686}
3687
3688/*
3689 * lock_rwsem must be held for write
3690 */
3691static int rbd_try_lock(struct rbd_device *rbd_dev)
3692{
3693 struct ceph_client *client = rbd_dev->rbd_client->client;
3694 struct ceph_locker *lockers;
3695 u32 num_lockers;
3696 int ret;
3697
3698 for (;;) {
3699 ret = rbd_lock(rbd_dev);
3700 if (ret != -EBUSY)
3701 return ret;
3702
3703 /* determine if the current lock holder is still alive */
3704 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3705 if (ret)
3706 return ret;
3707
3708 if (num_lockers == 0)
3709 goto again;
3710
3711 ret = find_watcher(rbd_dev, lockers);
3712 if (ret) {
3713 if (ret > 0)
3714 ret = 0; /* have to request lock */
3715 goto out;
3716 }
3717
3718 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3719 ENTITY_NAME(lockers[0].id.name));
3720
3721 ret = ceph_monc_blacklist_add(&client->monc,
3722 &lockers[0].info.addr);
3723 if (ret) {
3724 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3725 ENTITY_NAME(lockers[0].id.name), ret);
3726 goto out;
3727 }
3728
3729 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3730 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3731 lockers[0].id.cookie,
3732 &lockers[0].id.name);
3733 if (ret && ret != -ENOENT)
3734 goto out;
3735
3736again:
3737 ceph_free_lockers(lockers, num_lockers);
3738 }
3739
3740out:
3741 ceph_free_lockers(lockers, num_lockers);
3742 return ret;
3743}
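/*
 * The lock-steal sequence exercised above, in order:
 *
 *	1. ceph_cls_lock()           -- try to take it; -EBUSY if held
 *	2. get_lock_owner_info()     -- who holds it, and how?
 *	3. find_watcher()            -- is the holder's watch alive?
 *	   alive -> return 0 and fall back to rbd_request_lock()
 *	4. ceph_monc_blacklist_add() -- fence the dead holder
 *	5. ceph_cls_break_lock()     -- break its lock, then retry 1.
 */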
3744
3745/*
3746 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3747 */
3748static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3749 int *pret)
3750{
3751 enum rbd_lock_state lock_state;
3752
3753 down_read(&rbd_dev->lock_rwsem);
3754 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3755 rbd_dev->lock_state);
3756 if (__rbd_is_lock_owner(rbd_dev)) {
3757 lock_state = rbd_dev->lock_state;
3758 up_read(&rbd_dev->lock_rwsem);
3759 return lock_state;
3760 }
3761
3762 up_read(&rbd_dev->lock_rwsem);
3763 down_write(&rbd_dev->lock_rwsem);
3764 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3765 rbd_dev->lock_state);
3766 if (!__rbd_is_lock_owner(rbd_dev)) {
3767 *pret = rbd_try_lock(rbd_dev);
3768 if (*pret)
3769 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3770 }
3771
3772 lock_state = rbd_dev->lock_state;
3773 up_write(&rbd_dev->lock_rwsem);
3774 return lock_state;
3775}
3776
3777static void rbd_acquire_lock(struct work_struct *work)
3778{
3779 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3780 struct rbd_device, lock_dwork);
3781 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003782 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003783
3784 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3785again:
3786 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3787 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3788 if (lock_state == RBD_LOCK_STATE_LOCKED)
3789 wake_requests(rbd_dev, true);
3790 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3791 rbd_dev, lock_state, ret);
3792 return;
3793 }
3794
3795 ret = rbd_request_lock(rbd_dev);
3796 if (ret == -ETIMEDOUT) {
3797 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003798 } else if (ret == -EROFS) {
3799 rbd_warn(rbd_dev, "peer will not release lock");
3800 /*
3801 * If this is rbd_add_acquire_lock(), we want to fail
3802 * immediately -- reuse BLACKLISTED flag. Otherwise we
3803 * want to block.
3804 */
3805 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3806 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3807 /* wake "rbd map --exclusive" process */
3808 wake_requests(rbd_dev, false);
3809 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003810 } else if (ret < 0) {
3811 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3812 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3813 RBD_RETRY_DELAY);
3814 } else {
3815 /*
3816 * lock owner acked, but resend if we don't see them
3817 * release the lock
3818 */
3819 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3820 rbd_dev);
3821 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3822 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3823 }
3824}
3825
3826/*
3827 * lock_rwsem must be held for write
3828 */
3829static bool rbd_release_lock(struct rbd_device *rbd_dev)
3830{
3831 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3832 rbd_dev->lock_state);
3833 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3834 return false;
3835
3836 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3837 downgrade_write(&rbd_dev->lock_rwsem);
3838 /*
3839 * Ensure that all in-flight IO is flushed.
3840 *
3841 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3842 * may be shared with other devices.
3843 */
3844 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3845 up_read(&rbd_dev->lock_rwsem);
3846
3847 down_write(&rbd_dev->lock_rwsem);
3848 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3849 rbd_dev->lock_state);
3850 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3851 return false;
3852
Ilya Dryomovbbead742017-04-13 12:17:38 +02003853 rbd_unlock(rbd_dev);
3854 /*
3855 * Give others a chance to grab the lock - we would re-acquire
3856 * almost immediately if we got new IO during ceph_osdc_sync()
3857 * otherwise. We need to ack our own notifications, so this
3858 * lock_dwork will be requeued from rbd_wait_state_locked()
3859 * after wake_requests() in rbd_handle_released_lock().
3860 */
3861 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003862 return true;
3863}
3864
3865static void rbd_release_lock_work(struct work_struct *work)
3866{
3867 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3868 unlock_work);
3869
3870 down_write(&rbd_dev->lock_rwsem);
3871 rbd_release_lock(rbd_dev);
3872 up_write(&rbd_dev->lock_rwsem);
3873}
3874
3875static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3876 void **p)
3877{
3878 struct rbd_client_id cid = { 0 };
3879
3880 if (struct_v >= 2) {
3881 cid.gid = ceph_decode_64(p);
3882 cid.handle = ceph_decode_64(p);
3883 }
3884
3885 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3886 cid.handle);
3887 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3888 down_write(&rbd_dev->lock_rwsem);
3889 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3890 /*
3891 * we already know that the remote client is
3892 * the owner
3893 */
3894 up_write(&rbd_dev->lock_rwsem);
3895 return;
3896 }
3897
3898 rbd_set_owner_cid(rbd_dev, &cid);
3899 downgrade_write(&rbd_dev->lock_rwsem);
3900 } else {
3901 down_read(&rbd_dev->lock_rwsem);
3902 }
3903
3904 if (!__rbd_is_lock_owner(rbd_dev))
3905 wake_requests(rbd_dev, false);
3906 up_read(&rbd_dev->lock_rwsem);
3907}
3908
3909static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3910 void **p)
3911{
3912 struct rbd_client_id cid = { 0 };
3913
3914 if (struct_v >= 2) {
3915 cid.gid = ceph_decode_64(p);
3916 cid.handle = ceph_decode_64(p);
3917 }
3918
3919 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3920 cid.handle);
3921 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3922 down_write(&rbd_dev->lock_rwsem);
3923 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3924 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3925 __func__, rbd_dev, cid.gid, cid.handle,
3926 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3927 up_write(&rbd_dev->lock_rwsem);
3928 return;
3929 }
3930
3931 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3932 downgrade_write(&rbd_dev->lock_rwsem);
3933 } else {
3934 down_read(&rbd_dev->lock_rwsem);
3935 }
3936
3937 if (!__rbd_is_lock_owner(rbd_dev))
3938 wake_requests(rbd_dev, false);
3939 up_read(&rbd_dev->lock_rwsem);
3940}
3941
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003942/*
3943 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3944 * ResponseMessage is needed.
3945 */
3946static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3947 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003948{
3949 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3950 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003951 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003952
3953 if (struct_v >= 2) {
3954 cid.gid = ceph_decode_64(p);
3955 cid.handle = ceph_decode_64(p);
3956 }
3957
3958 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3959 cid.handle);
3960 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003961 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003962
3963 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003964 if (__rbd_is_lock_owner(rbd_dev)) {
3965 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3966 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3967 goto out_unlock;
3968
3969 /*
3970 * encode ResponseMessage(0) so the peer can detect
3971 * a missing owner
3972 */
3973 result = 0;
3974
3975 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003976 if (!rbd_dev->opts->exclusive) {
3977 dout("%s rbd_dev %p queueing unlock_work\n",
3978 __func__, rbd_dev);
3979 queue_work(rbd_dev->task_wq,
3980 &rbd_dev->unlock_work);
3981 } else {
3982 /* refuse to release the lock */
3983 result = -EROFS;
3984 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003985 }
3986 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003987
3988out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003989 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003990 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003991}
3992
3993static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3994 u64 notify_id, u64 cookie, s32 *result)
3995{
3996 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3997 int buf_size = 4 + CEPH_ENCODING_START_BLK_LEN;
3998 char buf[buf_size];
3999 int ret;
4000
4001 if (result) {
4002 void *p = buf;
4003
4004 /* encode ResponseMessage */
4005 ceph_start_encoding(&p, 1, 1,
4006 buf_size - CEPH_ENCODING_START_BLK_LEN);
4007 ceph_encode_32(&p, *result);
4008 } else {
4009 buf_size = 0;
4010 }
4011
4012 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
4013 &rbd_dev->header_oloc, notify_id, cookie,
4014 buf, buf_size);
4015 if (ret)
4016 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
4017}
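/*
 * The ResponseMessage payload above is just an encoded s32; acking
 * REQUEST_LOCK with -EROFS, for example, produces (little-endian)
 *
 *	u8  struct_v = 1;	u8 struct_compat = 1;
 *	u32 struct_len = 4;
 *	s32 result = -EROFS;
 *
 * while a plain ack (result == NULL) sends an empty payload
 * (buf_size == 0).
 */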
4018
4019static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
4020 u64 cookie)
4021{
4022 dout("%s rbd_dev %p\n", __func__, rbd_dev);
4023 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
4024}
4025
4026static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
4027 u64 notify_id, u64 cookie, s32 result)
4028{
4029 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
4030 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
4031}
Ilya Dryomov922dab62016-05-26 01:15:02 +02004032
4033static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
4034 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06004035{
Ilya Dryomov922dab62016-05-26 01:15:02 +02004036 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004037 void *p = data;
4038 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02004039 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02004040 u32 len;
4041 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06004042 int ret;
4043
Ilya Dryomoved95b212016-08-12 16:40:02 +02004044 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
4045 __func__, rbd_dev, cookie, notify_id, data_len);
4046 if (data_len) {
4047 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
4048 &struct_v, &len);
4049 if (ret) {
4050 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
4051 ret);
4052 return;
4053 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04004054
Ilya Dryomoved95b212016-08-12 16:40:02 +02004055 notify_op = ceph_decode_32(&p);
4056 } else {
4057 /* legacy notification for header updates */
4058 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
4059 len = 0;
4060 }
Alex Elderb8d70032012-11-30 17:53:04 -06004061
Ilya Dryomoved95b212016-08-12 16:40:02 +02004062 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
4063 switch (notify_op) {
4064 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
4065 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
4066 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4067 break;
4068 case RBD_NOTIFY_OP_RELEASED_LOCK:
4069 rbd_handle_released_lock(rbd_dev, struct_v, &p);
4070 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4071 break;
4072 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004073 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
4074 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02004075 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02004076 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02004077 else
4078 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4079 break;
4080 case RBD_NOTIFY_OP_HEADER_UPDATE:
4081 ret = rbd_dev_refresh(rbd_dev);
4082 if (ret)
4083 rbd_warn(rbd_dev, "refresh failed: %d", ret);
4084
4085 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4086 break;
4087 default:
4088 if (rbd_is_lock_owner(rbd_dev))
4089 rbd_acknowledge_notify_result(rbd_dev, notify_id,
4090 cookie, -EOPNOTSUPP);
4091 else
4092 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
4093 break;
4094 }
Alex Elderb8d70032012-11-30 17:53:04 -06004095}

static void __rbd_unregister_watch(struct rbd_device *rbd_dev);

static void rbd_watch_errcb(void *arg, u64 cookie, int err)
{
	struct rbd_device *rbd_dev = arg;

	rbd_warn(rbd_dev, "encountered watch error: %d", err);

	down_write(&rbd_dev->lock_rwsem);
	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
	up_write(&rbd_dev->lock_rwsem);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
		__rbd_unregister_watch(rbd_dev);
		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;

		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
	}
	mutex_unlock(&rbd_dev->watch_mutex);
}

/*
 * watch_mutex must be locked
 */
static int __rbd_register_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_linger_request *handle;

	rbd_assert(!rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
				 &rbd_dev->header_oloc, rbd_watch_cb,
				 rbd_watch_errcb, rbd_dev);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	rbd_dev->watch_handle = handle;
	return 0;
}
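
/*
 * Note: the linger request returned by ceph_osdc_watch() doubles as the
 * watch handle; its linger_id is what rbd_register_watch() publishes as
 * rbd_dev->watch_cookie (see below).  A summary, not new machinery.
 */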

/*
 * watch_mutex must be locked
 */
static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	int ret;

	rbd_assert(rbd_dev->watch_handle);
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
	if (ret)
		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);

	rbd_dev->watch_handle = NULL;
}

static int rbd_register_watch(struct rbd_device *rbd_dev)
{
	int ret;

	mutex_lock(&rbd_dev->watch_mutex);
	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
	ret = __rbd_register_watch(rbd_dev);
	if (ret)
		goto out;

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;

out:
	mutex_unlock(&rbd_dev->watch_mutex);
	return ret;
}

static void cancel_tasks_sync(struct rbd_device *rbd_dev)
{
	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
	cancel_work_sync(&rbd_dev->acquired_lock_work);
	cancel_work_sync(&rbd_dev->released_lock_work);
	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
	cancel_work_sync(&rbd_dev->unlock_work);
}

static void rbd_unregister_watch(struct rbd_device *rbd_dev)
{
	WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
	cancel_tasks_sync(rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
		__rbd_unregister_watch(rbd_dev);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	mutex_unlock(&rbd_dev->watch_mutex);

	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
}
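
/*
 * Watch lifecycle in brief (a summary of the helpers above):
 * rbd_register_watch() establishes the watch on the header object; on a
 * watch error, rbd_watch_errcb() tears the watch down and queues
 * rbd_reregister_watch() via watch_dwork; rbd_unregister_watch() cancels
 * outstanding task work, drops the watch and flushes pending notifies.
 */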

/*
 * lock_rwsem must be held for write
 */
static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	char cookie[32];
	int ret;

	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	format_lock_cookie(rbd_dev, cookie);
	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
				  RBD_LOCK_TAG, cookie);
	if (ret) {
		if (ret != -EOPNOTSUPP)
			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
				 ret);

		/*
		 * Lock cookie cannot be updated on older OSDs, so do
		 * a manual release and queue an acquire.
		 */
		if (rbd_release_lock(rbd_dev))
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->lock_dwork, 0);
	} else {
		__rbd_lock(rbd_dev, cookie);
	}
}
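
/*
 * A hedged note on the cookie swap above: after the watch is
 * re-established the linger id changes, so the exclusive-lock cookie
 * derived from it (see format_lock_cookie()) goes stale.
 * ceph_cls_set_cookie() updates it in place where the OSD supports
 * that; otherwise we fall back to release-and-reacquire.
 */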

static void rbd_reregister_watch(struct work_struct *work)
{
	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
					    struct rbd_device, watch_dwork);
	int ret;

	dout("%s rbd_dev %p\n", __func__, rbd_dev);

	mutex_lock(&rbd_dev->watch_mutex);
	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	ret = __rbd_register_watch(rbd_dev);
	if (ret) {
		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
		if (ret == -EBLACKLISTED || ret == -ENOENT) {
			set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
			wake_requests(rbd_dev, true);
		} else {
			queue_delayed_work(rbd_dev->task_wq,
					   &rbd_dev->watch_dwork,
					   RBD_RETRY_DELAY);
		}
		mutex_unlock(&rbd_dev->watch_mutex);
		return;
	}

	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
	mutex_unlock(&rbd_dev->watch_mutex);

	down_write(&rbd_dev->lock_rwsem);
	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		rbd_reacquire_lock(rbd_dev);
	up_write(&rbd_dev->lock_rwsem);

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
}

/*
 * Synchronous osd object method call.  Returns the number of bytes
 * returned in the inbound buffer, or a negative error code.
 */
static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     const char *method_name,
			     const void *outbound,
			     size_t outbound_size,
			     void *inbound,
			     size_t inbound_size)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct page *req_page = NULL;
	struct page *reply_page;
	int ret;

	/*
	 * Method calls are ultimately read operations.  The result
	 * should be placed into the inbound buffer provided.  They
	 * also supply outbound data: parameters for the object
	 * method.  Currently if this is present it will be a
	 * snapshot id.
	 */
	if (outbound) {
		if (outbound_size > PAGE_SIZE)
			return -E2BIG;

		req_page = alloc_page(GFP_KERNEL);
		if (!req_page)
			return -ENOMEM;

		memcpy(page_address(req_page), outbound, outbound_size);
	}

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		if (req_page)
			__free_page(req_page);
		return -ENOMEM;
	}

	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
			     reply_page, &inbound_size);
	if (!ret) {
		memcpy(inbound, page_address(reply_page), inbound_size);
		ret = inbound_size;
	}

	if (req_page)
		__free_page(req_page);
	__free_page(reply_page);
	return ret;
}
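
/*
 * Example use (a sketch mirroring the "get_size" call further down, not
 * a new interface): query an image property from the header object:
 *
 *	__le64 snapid = cpu_to_le64(CEPH_NOSNAP);
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "get_size",
 *				  &snapid, sizeof(snapid),
 *				  &size_buf, sizeof(size_buf));
 */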

/*
 * lock_rwsem must be held for read
 */
static void rbd_wait_state_locked(struct rbd_device *rbd_dev)
{
	DEFINE_WAIT(wait);

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		schedule();
		down_read(&rbd_dev->lock_rwsem);
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
		 !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags));

	finish_wait(&rbd_dev->lock_waitq, &wait);
}

static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	rbd_assert(op_type == OBJ_OP_READ ||
		   rbd_dev->spec->snap_id == CEPH_NOSNAP);

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED &&
		    !test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			if (rbd_dev->opts->exclusive) {
				rbd_warn(rbd_dev, "exclusive lock required");
				result = -EROFS;
				goto err_unlock;
			}
			rbd_wait_state_locked(rbd_dev);
		}
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			result = -EBLACKLISTED;
			goto err_unlock;
		}
	}

	img_request = rbd_img_request_create(rbd_dev, offset, length, op_type,
					     snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD)
		result = rbd_img_request_fill(img_request, OBJ_REQUEST_NODATA,
					      NULL);
	else {
		struct ceph_bio_iter bio_it = { .bio = rq->bio,
						.iter = rq->bio->bi_iter };

		result = rbd_img_request_fill(img_request, OBJ_REQUEST_BIO,
					      &bio_it);
	}
	if (result)
		goto err_img_request;

	result = rbd_img_request_submit(img_request);
	if (result)
		goto err_img_request;

	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

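/*
 * blk-mq entry point: each request's pdu is a work_struct (see
 * rbd_init_request() below), so queueing a request amounts to bouncing
 * it to rbd_wq, where rbd_queue_workfn() does the real work.
 */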
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}
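
/*
 * Example use (a sketch of the call made by rbd_dev_v1_header_info()
 * below): read the on-disk v1 header object into a local buffer:
 *
 *	ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
 *				&rbd_dev->header_oloc, ondisk, size);
 */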

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};
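
/*
 * These ops are wired up in rbd_init_disk() below; tag_set.cmd_size is
 * sizeof(struct work_struct), which is what makes the pdu-as-work trick
 * in rbd_queue_rq()/rbd_init_request() possible.
 */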

static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	u64 segment_size;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	/* set io sizes to object size */
	segment_size = rbd_obj_bytes(&rbd_dev->header);
	blk_queue_max_hw_sectors(q, segment_size / SECTOR_SIZE);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, segment_size);
	blk_queue_io_opt(q, segment_size);

	/* enable the discard support */
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
	q->limits.discard_granularity = segment_size;
	blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE);
	blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE);

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		(unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
			(unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
			(unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL);
static DEVICE_ATTR(features, S_IRUGO, rbd_features_show, NULL);
static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL);
static DEVICE_ATTR(minor, S_IRUGO, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, S_IRUGO, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, S_IRUGO, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, S_IRUSR, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL);
static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, S_IRUGO, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, S_IRUGO, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, S_IRUGO, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};
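
/*
 * Example (assuming a mapped device with id 0): these attributes show
 * up under /sys/bus/rbd/devices/0/, e.g.
 *
 *	$ cat /sys/bus/rbd/devices/0/size
 *	$ echo 1 > /sys/bus/rbd/devices/0/refresh
 */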

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting module outside of
	 * the release callback.  The race window is pretty small, so
	 * doing something similar to dm (dm-builtin.c) is overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}
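
/*
 * Lifetime sketch (summarizing the helpers above): rbd_dev_create()
 * pairs with rbd_dev_destroy(); the final put_device() drops the ref
 * taken by device_initialize(), so rbd_dev_release() and in turn
 * rbd_dev_free() run only once the last reference goes away.
 */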

/*
 * Get the size and object order for an image snapshot, or if
 * snap_id is CEPH_NOSNAP, gets this information for the base
 * image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
		(unsigned long long)snap_id,
		(unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
					&rbd_dev->header.obj_order,
					&rbd_dev->header.image_size);
}

static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}
5277
Alex Elderb1b54022012-07-03 16:01:19 -05005278static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
5279 u64 *snap_features)
5280{
5281 __le64 snapid = cpu_to_le64(snap_id);
5282 struct {
5283 __le64 features;
5284 __le64 incompat;
Alex Elder41579762013-04-21 12:14:45 -05005285 } __attribute__ ((packed)) features_buf = { 0 };
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005286 u64 unsup;
Alex Elderb1b54022012-07-03 16:01:19 -05005287 int ret;
5288
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005289 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5290 &rbd_dev->header_oloc, "get_features",
5291 &snapid, sizeof(snapid),
5292 &features_buf, sizeof(features_buf));
Alex Elder36be9a72013-01-19 00:30:28 -06005293 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderb1b54022012-07-03 16:01:19 -05005294 if (ret < 0)
5295 return ret;
Alex Elder57385b52013-04-21 12:14:45 -05005296 if (ret < sizeof (features_buf))
5297 return -ERANGE;
Alex Elderd8891402012-10-09 13:50:17 -07005298
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005299 unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
5300 if (unsup) {
5301 rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
5302 unsup);
Alex Elderb8f5c6e2012-11-01 08:39:26 -05005303 return -ENXIO;
Ilya Dryomovd3767f02016-04-13 14:15:50 +02005304 }
Alex Elderd8891402012-10-09 13:50:17 -07005305
Alex Elderb1b54022012-07-03 16:01:19 -05005306 *snap_features = le64_to_cpu(features_buf.features);
5307
5308 dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
Alex Elder57385b52013-04-21 12:14:45 -05005309 (unsigned long long)snap_id,
5310 (unsigned long long)*snap_features,
5311 (unsigned long long)le64_to_cpu(features_buf.incompat));
Alex Elderb1b54022012-07-03 16:01:19 -05005312
5313 return 0;
5314}
5315
5316static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
5317{
5318 return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
5319 &rbd_dev->header.features);
5320}
5321
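/*
 * Query a layered image's parent (pool id, image id, snap id, overlap)
 * with the "get_parent" class method and update rbd_dev accordingly,
 * including the case where the image was flattened since the last
 * refresh.
 */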
Alex Elder86b00e02012-10-25 23:34:42 -05005322static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
5323{
5324 struct rbd_spec *parent_spec;
5325 size_t size;
5326 void *reply_buf = NULL;
5327 __le64 snapid;
5328 void *p;
5329 void *end;
Alex Elder642a2532013-05-06 17:40:33 -05005330 u64 pool_id;
Alex Elder86b00e02012-10-25 23:34:42 -05005331 char *image_id;
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005332 u64 snap_id;
Alex Elder86b00e02012-10-25 23:34:42 -05005333 u64 overlap;
Alex Elder86b00e02012-10-25 23:34:42 -05005334 int ret;
5335
5336 parent_spec = rbd_spec_alloc();
5337 if (!parent_spec)
5338 return -ENOMEM;
5339
5340 size = sizeof (__le64) + /* pool_id */
5341 sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX + /* image_id */
5342 sizeof (__le64) + /* snap_id */
5343 sizeof (__le64); /* overlap */
5344 reply_buf = kmalloc(size, GFP_KERNEL);
5345 if (!reply_buf) {
5346 ret = -ENOMEM;
5347 goto out_err;
5348 }
5349
Ilya Dryomov4d9b67c2014-07-24 10:42:13 +04005350 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005351 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5352 &rbd_dev->header_oloc, "get_parent",
5353 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005354 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder86b00e02012-10-25 23:34:42 -05005355 if (ret < 0)
5356 goto out_err;
5357
Alex Elder86b00e02012-10-25 23:34:42 -05005358 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005359 end = reply_buf + ret;
5360 ret = -ERANGE;
Alex Elder642a2532013-05-06 17:40:33 -05005361 ceph_decode_64_safe(&p, end, pool_id, out_err);
Alex Elder392a9da2013-05-06 17:40:33 -05005362 if (pool_id == CEPH_NOPOOL) {
5363 /*
5364		 * Either the parent never existed, or we have a
5365 * record of it but the image got flattened so it no
5366 * longer has a parent. When the parent of a
5367 * layered image disappears we immediately set the
5368 * overlap to 0. The effect of this is that all new
5369 * requests will be treated as if the image had no
5370 * parent.
5371 */
5372 if (rbd_dev->parent_overlap) {
5373 rbd_dev->parent_overlap = 0;
Alex Elder392a9da2013-05-06 17:40:33 -05005374 rbd_dev_parent_put(rbd_dev);
5375 pr_info("%s: clone image has been flattened\n",
5376 rbd_dev->disk->disk_name);
5377 }
5378
Alex Elder86b00e02012-10-25 23:34:42 -05005379 goto out; /* No parent? No problem. */
Alex Elder392a9da2013-05-06 17:40:33 -05005380 }
Alex Elder86b00e02012-10-25 23:34:42 -05005381
Alex Elder0903e872012-11-14 12:25:19 -06005382 /* The ceph file layout needs to fit pool id in 32 bits */
5383
5384 ret = -EIO;
Alex Elder642a2532013-05-06 17:40:33 -05005385 if (pool_id > (u64)U32_MAX) {
Ilya Dryomov9584d502014-07-11 12:11:20 +04005386 rbd_warn(NULL, "parent pool id too large (%llu > %u)",
Alex Elder642a2532013-05-06 17:40:33 -05005387 (unsigned long long)pool_id, U32_MAX);
Alex Elder57385b52013-04-21 12:14:45 -05005388 goto out_err;
Alex Elderc0cd10db2013-04-26 09:43:47 -05005389 }
Alex Elder0903e872012-11-14 12:25:19 -06005390
Alex Elder979ed482012-11-01 08:39:26 -05005391 image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elder86b00e02012-10-25 23:34:42 -05005392 if (IS_ERR(image_id)) {
5393 ret = PTR_ERR(image_id);
5394 goto out_err;
5395 }
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005396 ceph_decode_64_safe(&p, end, snap_id, out_err);
Alex Elder86b00e02012-10-25 23:34:42 -05005397 ceph_decode_64_safe(&p, end, overlap, out_err);
5398
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005399 /*
5400 * The parent won't change (except when the clone is
5401 * flattened, already handled that). So we only need to
5402	 * record the parent spec if we have not already done so.
5403 */
5404 if (!rbd_dev->parent_spec) {
5405 parent_spec->pool_id = pool_id;
5406 parent_spec->image_id = image_id;
5407 parent_spec->snap_id = snap_id;
Alex Elder70cf49c2013-05-06 17:40:33 -05005408 rbd_dev->parent_spec = parent_spec;
5409 parent_spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovfbba11b2014-06-27 21:46:33 +04005410 } else {
5411 kfree(image_id);
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005412 }
5413
5414 /*
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005415 * We always update the parent overlap. If it's zero we issue
5416	 * a warning, as we will proceed as if there were no parent.
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005417 */
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005418 if (!overlap) {
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005419 if (parent_spec) {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005420 /* refresh, careful to warn just once */
5421 if (rbd_dev->parent_overlap)
5422 rbd_warn(rbd_dev,
5423 "clone now standalone (overlap became 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005424 } else {
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005425 /* initial probe */
5426 rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
Alex Elder3b5cf2a2013-05-29 11:18:59 -05005427 }
Alex Elder70cf49c2013-05-06 17:40:33 -05005428 }
Ilya Dryomovcf32bd92015-01-19 22:57:39 +03005429 rbd_dev->parent_overlap = overlap;
5430
Alex Elder86b00e02012-10-25 23:34:42 -05005431out:
5432 ret = 0;
5433out_err:
5434 kfree(reply_buf);
5435 rbd_spec_put(parent_spec);
5436
5437 return ret;
5438}
5439
Alex Eldercc070d52013-04-21 12:14:45 -05005440static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
5441{
5442 struct {
5443 __le64 stripe_unit;
5444 __le64 stripe_count;
5445 } __attribute__ ((packed)) striping_info_buf = { 0 };
5446 size_t size = sizeof (striping_info_buf);
5447 void *p;
5448 u64 obj_size;
5449 u64 stripe_unit;
5450 u64 stripe_count;
5451 int ret;
5452
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005453 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5454 &rbd_dev->header_oloc, "get_stripe_unit_count",
5455 NULL, 0, &striping_info_buf, size);
Alex Eldercc070d52013-04-21 12:14:45 -05005456 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
5457 if (ret < 0)
5458 return ret;
5459 if (ret < size)
5460 return -ERANGE;
5461
5462 /*
5463 * We don't actually support the "fancy striping" feature
5464 * (STRIPINGV2) yet, but if the striping sizes are the
5465 * defaults the behavior is the same as before. So find
5466 * out, and only fail if the image has non-default values.
5467 */
5468 ret = -EINVAL;
Ilya Dryomov5bc3fb12017-01-25 18:16:22 +01005469 obj_size = rbd_obj_bytes(&rbd_dev->header);
Alex Eldercc070d52013-04-21 12:14:45 -05005470 p = &striping_info_buf;
5471 stripe_unit = ceph_decode_64(&p);
5472 if (stripe_unit != obj_size) {
5473 rbd_warn(rbd_dev, "unsupported stripe unit "
5474 "(got %llu want %llu)",
5475 stripe_unit, obj_size);
5476 return -EINVAL;
5477 }
5478 stripe_count = ceph_decode_64(&p);
5479 if (stripe_count != 1) {
5480 rbd_warn(rbd_dev, "unsupported stripe count "
5481 "(got %llu want 1)", stripe_count);
5482 return -EINVAL;
5483 }
Alex Elder500d0c02013-04-26 09:43:47 -05005484 rbd_dev->header.stripe_unit = stripe_unit;
5485 rbd_dev->header.stripe_count = stripe_count;
Alex Eldercc070d52013-04-21 12:14:45 -05005486
5487 return 0;
5488}
5489
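/*
 * Look up the id of the separate data pool (DATA_POOL feature).  Only
 * data objects are stored in that pool; the header object stays in the
 * base pool.
 */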
Ilya Dryomov7e973322017-01-25 18:16:22 +01005490static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
5491{
5492 __le64 data_pool_id;
5493 int ret;
5494
5495 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5496 &rbd_dev->header_oloc, "get_data_pool",
5497 NULL, 0, &data_pool_id, sizeof(data_pool_id));
5498 if (ret < 0)
5499 return ret;
5500 if (ret < sizeof(data_pool_id))
5501 return -EBADMSG;
5502
5503 rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
5504 WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
5505 return 0;
5506}
5507
Alex Elder9e15b772012-10-30 19:40:33 -05005508static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
5509{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005510 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder9e15b772012-10-30 19:40:33 -05005511 size_t image_id_size;
5512 char *image_id;
5513 void *p;
5514 void *end;
5515 size_t size;
5516 void *reply_buf = NULL;
5517 size_t len = 0;
5518 char *image_name = NULL;
5519 int ret;
5520
5521 rbd_assert(!rbd_dev->spec->image_name);
5522
Alex Elder69e7a022012-11-01 08:39:26 -05005523 len = strlen(rbd_dev->spec->image_id);
5524 image_id_size = sizeof (__le32) + len;
Alex Elder9e15b772012-10-30 19:40:33 -05005525 image_id = kmalloc(image_id_size, GFP_KERNEL);
5526 if (!image_id)
5527 return NULL;
5528
5529 p = image_id;
Alex Elder41579762013-04-21 12:14:45 -05005530 end = image_id + image_id_size;
Alex Elder57385b52013-04-21 12:14:45 -05005531 ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
Alex Elder9e15b772012-10-30 19:40:33 -05005532
5533 size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
5534 reply_buf = kmalloc(size, GFP_KERNEL);
5535 if (!reply_buf)
5536 goto out;
5537
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005538 ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
5539 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5540 "dir_get_name", image_id, image_id_size,
5541 reply_buf, size);
Alex Elder9e15b772012-10-30 19:40:33 -05005542 if (ret < 0)
5543 goto out;
5544 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005545 end = reply_buf + ret;
5546
Alex Elder9e15b772012-10-30 19:40:33 -05005547 image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
5548 if (IS_ERR(image_name))
5549 image_name = NULL;
5550 else
5551 dout("%s: name is %s len is %zd\n", __func__, image_name, len);
5552out:
5553 kfree(reply_buf);
5554 kfree(image_id);
5555
5556 return image_name;
5557}
5558
Alex Elder2ad3d712013-04-30 00:44:33 -05005559static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5560{
5561 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5562 const char *snap_name;
5563 u32 which = 0;
5564
5565 /* Skip over names until we find the one we are looking for */
5566
5567 snap_name = rbd_dev->header.snap_names;
5568 while (which < snapc->num_snaps) {
5569 if (!strcmp(name, snap_name))
5570 return snapc->snaps[which];
5571 snap_name += strlen(snap_name) + 1;
5572 which++;
5573 }
5574 return CEPH_NOSNAP;
5575}
5576
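/*
 * Format 2 does not keep snapshot names in the header blob, so fetch
 * each candidate's name with rbd_dev_v2_snap_name() and compare (hence
 * the kfree() of every returned name below).
 */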
5577static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5578{
5579 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5580 u32 which;
5581 bool found = false;
5582 u64 snap_id;
5583
5584 for (which = 0; !found && which < snapc->num_snaps; which++) {
5585 const char *snap_name;
5586
5587 snap_id = snapc->snaps[which];
5588 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005589 if (IS_ERR(snap_name)) {
5590 /* ignore no-longer existing snapshots */
5591 if (PTR_ERR(snap_name) == -ENOENT)
5592 continue;
5593 else
5594 break;
5595 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005596 found = !strcmp(name, snap_name);
5597 kfree(snap_name);
5598 }
5599 return found ? snap_id : CEPH_NOSNAP;
5600}
5601
5602/*
5603 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5604 * no snapshot by that name is found, or if an error occurs.
5605 */
5606static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5607{
5608 if (rbd_dev->image_format == 1)
5609 return rbd_v1_snap_id_by_name(rbd_dev, name);
5610
5611 return rbd_v2_snap_id_by_name(rbd_dev, name);
5612}
5613
Alex Elder9e15b772012-10-30 19:40:33 -05005614/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005615 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005616 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005617static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5618{
5619 struct rbd_spec *spec = rbd_dev->spec;
5620
5621 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5622 rbd_assert(spec->image_id && spec->image_name);
5623 rbd_assert(spec->snap_name);
5624
5625 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5626 u64 snap_id;
5627
5628 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5629 if (snap_id == CEPH_NOSNAP)
5630 return -ENOENT;
5631
5632 spec->snap_id = snap_id;
5633 } else {
5634 spec->snap_id = CEPH_NOSNAP;
5635 }
5636
5637 return 0;
5638}
5639
5640/*
5641 * A parent image will have all ids but none of the names.
5642 *
5643 * All names in an rbd spec are dynamically allocated. It's OK if we
5644 * can't figure out the name for an image id.
5645 */
5646static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005647{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005648 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5649 struct rbd_spec *spec = rbd_dev->spec;
5650 const char *pool_name;
5651 const char *image_name;
5652 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005653 int ret;
5654
Ilya Dryomov04077592014-07-23 17:11:20 +04005655 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5656 rbd_assert(spec->image_id);
5657 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005658
Alex Elder2e9f7f12013-04-26 09:43:48 -05005659 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005660
Alex Elder2e9f7f12013-04-26 09:43:48 -05005661 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5662 if (!pool_name) {
5663 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005664 return -EIO;
5665 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005666 pool_name = kstrdup(pool_name, GFP_KERNEL);
5667 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005668 return -ENOMEM;
5669
5670 /* Fetch the image name; tolerate failure here */
5671
Alex Elder2e9f7f12013-04-26 09:43:48 -05005672 image_name = rbd_dev_image_name(rbd_dev);
5673 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005674 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005675
Ilya Dryomov04077592014-07-23 17:11:20 +04005676 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005677
Alex Elder2e9f7f12013-04-26 09:43:48 -05005678 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005679 if (IS_ERR(snap_name)) {
5680 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005681 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005682 }
5683
5684 spec->pool_name = pool_name;
5685 spec->image_name = image_name;
5686 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005687
5688 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005689
Alex Elder9e15b772012-10-30 19:40:33 -05005690out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005691 kfree(image_name);
5692 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005693 return ret;
5694}
5695
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005696static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005697{
5698 size_t size;
5699 int ret;
5700 void *reply_buf;
5701 void *p;
5702 void *end;
5703 u64 seq;
5704 u32 snap_count;
5705 struct ceph_snap_context *snapc;
5706 u32 i;
5707
5708 /*
5709 * We'll need room for the seq value (maximum snapshot id),
5710 * snapshot count, and array of that many snapshot ids.
5711 * For now we have a fixed upper limit on the number we're
5712 * prepared to receive.
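	 * (With RBD_MAX_SNAP_COUNT of 510, per rbd_types.h, that is
	 * 8 + 4 + 510 * 8 = 4092 bytes, just under one 4 KiB page.)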
5713 */
5714 size = sizeof (__le64) + sizeof (__le32) +
5715 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5716 reply_buf = kzalloc(size, GFP_KERNEL);
5717 if (!reply_buf)
5718 return -ENOMEM;
5719
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005720 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5721 &rbd_dev->header_oloc, "get_snapcontext",
5722 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005723 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005724 if (ret < 0)
5725 goto out;
5726
Alex Elder35d489f2012-07-03 16:01:19 -05005727 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005728 end = reply_buf + ret;
5729 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005730 ceph_decode_64_safe(&p, end, seq, out);
5731 ceph_decode_32_safe(&p, end, snap_count, out);
5732
5733 /*
5734 * Make sure the reported number of snapshot ids wouldn't go
5735 * beyond the end of our buffer. But before checking that,
5736 * make sure the computed size of the snapshot context we
5737 * allocate is representable in a size_t.
5738 */
5739 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5740 / sizeof (u64)) {
5741 ret = -EINVAL;
5742 goto out;
5743 }
5744 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5745 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005746 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005747
Alex Elder812164f82013-04-30 00:44:32 -05005748 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005749 if (!snapc) {
5750 ret = -ENOMEM;
5751 goto out;
5752 }
Alex Elder35d489f2012-07-03 16:01:19 -05005753 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005754 for (i = 0; i < snap_count; i++)
5755 snapc->snaps[i] = ceph_decode_64(&p);
5756
Alex Elder49ece552013-05-06 08:37:00 -05005757 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005758 rbd_dev->header.snapc = snapc;
5759
5760 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005761 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005762out:
5763 kfree(reply_buf);
5764
Alex Elder57385b52013-04-21 12:14:45 -05005765 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005766}
5767
Alex Elder54cac612013-04-30 00:44:33 -05005768static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5769 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005770{
5771 size_t size;
5772 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005773 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005774 int ret;
5775 void *p;
5776 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005777 char *snap_name;
5778
5779 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5780 reply_buf = kmalloc(size, GFP_KERNEL);
5781 if (!reply_buf)
5782 return ERR_PTR(-ENOMEM);
5783
Alex Elder54cac612013-04-30 00:44:33 -05005784 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005785 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5786 &rbd_dev->header_oloc, "get_snapshot_name",
5787 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005788 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005789 if (ret < 0) {
5790 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005791 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005792 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005793
5794 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005795 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005796 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005797 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005798 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005799
Alex Elderf40eb342013-04-25 15:09:42 -05005800 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005801 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005802out:
5803 kfree(reply_buf);
5804
Alex Elderf40eb342013-04-25 15:09:42 -05005805 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005806}
5807
Alex Elder2df3fac2013-05-06 09:51:30 -05005808static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005809{
Alex Elder2df3fac2013-05-06 09:51:30 -05005810 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005811 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005812
Josh Durgin1617e402013-06-12 14:43:10 -07005813 ret = rbd_dev_v2_image_size(rbd_dev);
5814 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005815 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005816
Alex Elder2df3fac2013-05-06 09:51:30 -05005817 if (first_time) {
5818 ret = rbd_dev_v2_header_onetime(rbd_dev);
5819 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005820 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005821 }
5822
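	/*
	 * object_prefix == NULL is what marks a probe as "first time",
	 * so if fetching the snap context fails on the initial probe,
	 * undo the onetime setup so that a later retry redoes it.
	 */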
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005823 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005824 if (ret && first_time) {
5825 kfree(rbd_dev->header.object_prefix);
5826 rbd_dev->header.object_prefix = NULL;
5827 }
Alex Elder117973f2012-08-31 17:29:55 -05005828
5829 return ret;
5830}
5831
Ilya Dryomova720ae02014-07-23 17:11:19 +04005832static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5833{
5834 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5835
5836 if (rbd_dev->image_format == 1)
5837 return rbd_dev_v1_header_info(rbd_dev);
5838
5839 return rbd_dev_v2_header_info(rbd_dev);
5840}
5841
Alex Elder1ddbe942012-01-29 13:57:44 -06005842/*
Alex Eldere28fff262012-02-02 08:13:30 -06005843 * Skips over white space at *buf, and updates *buf to point to the
5844 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005845 * the token (string of non-white space characters) found. Note
5846 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005847 */
5848static inline size_t next_token(const char **buf)
5849{
5850 /*
5851 * These are the characters that produce nonzero for
5852 * isspace() in the "C" and "POSIX" locales.
5853 */
5854 const char *spaces = " \f\n\r\t\v";
5855
5856 *buf += strspn(*buf, spaces); /* Find start of token */
5857
5858 return strcspn(*buf, spaces); /* Return token length */
5859}
5860
5861/*
Alex Elderea3352f2012-07-09 21:04:23 -05005862 * Finds the next token in *buf, dynamically allocates a buffer big
5863 * enough to hold a copy of it, and copies the token into the new
5864 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5865 * that a duplicate buffer is created even for a zero-length token.
5866 *
5867 * Returns a pointer to the newly-allocated duplicate, or a null
5868 * pointer if memory for the duplicate was not available. If
5869 * the lenp argument is a non-null pointer, the length of the token
5870 * (not including the '\0') is returned in *lenp.
5871 *
5872 * If successful, the *buf pointer will be updated to point beyond
5873 * the end of the found token.
5874 *
5875 * Note: uses GFP_KERNEL for allocation.
5876 */
5877static inline char *dup_token(const char **buf, size_t *lenp)
5878{
5879 char *dup;
5880 size_t len;
5881
5882 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005883 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005884 if (!dup)
5885 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005886 *(dup + len) = '\0';
5887 *buf += len;
5888
5889 if (lenp)
5890 *lenp = len;
5891
5892 return dup;
5893}
5894
5895/*
Alex Elder859c31d2012-10-25 23:34:42 -05005896 * Parse the options provided for an "rbd add" (i.e., rbd image
5897 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5898 * and the data written is passed here via a NUL-terminated buffer.
5899 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005900 *
Alex Elder859c31d2012-10-25 23:34:42 -05005901 * The information extracted from these options is recorded in
5902 * the other parameters which return dynamically-allocated
5903 * structures:
5904 * ceph_opts
5905 * The address of a pointer that will refer to a ceph options
5906 * structure. Caller must release the returned pointer using
5907 * ceph_destroy_options() when it is no longer needed.
5908 * rbd_opts
5909 * Address of an rbd options pointer. Fully initialized by
5910 * this function; caller must release with kfree().
5911 * spec
5912 * Address of an rbd image specification pointer. Fully
5913 * initialized by this function based on parsed options.
5914 * Caller must release with rbd_spec_put().
5915 *
5916 * The options passed take this form:
5917 *	<mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5918 * where:
5919 * <mon_addrs>
5920 * A comma-separated list of one or more monitor addresses.
5921 * A monitor address is an ip address, optionally followed
5922 * by a port number (separated by a colon).
5923 * I.e.: ip1[:port1][,ip2[:port2]...]
5924 * <options>
5925 * A comma-separated list of ceph and/or rbd options.
5926 * <pool_name>
5927 * The name of the rados pool containing the rbd image.
5928 * <image_name>
5929 * The name of the image in that pool to map.
5930 *  <snap_name>
5931 *	An optional snapshot name.  If provided, the mapping will
5932 *	present data from the image at the time that snapshot was
5933 *	created.  The image head is used if no snapshot name is
5934 *	provided.  Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005935 */
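/*
 * For example, with hypothetical monitor address and names, a mapping
 * request could look like:
 *
 *	echo "1.2.3.4:6789 name=admin,secret=<key> rbd foo -" \
 *	    > /sys/bus/rbd/add
 *
 * which maps the head of image "foo" in pool "rbd".
 */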
Alex Elder859c31d2012-10-25 23:34:42 -05005936static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005937 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005938 struct rbd_options **opts,
5939 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005940{
Alex Elderd22f76e2012-07-12 10:46:35 -05005941 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005942 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005943 const char *mon_addrs;
Alex Elderecb4dc22013-04-26 09:43:47 -05005944 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005945 size_t mon_addrs_size;
Alex Elder859c31d2012-10-25 23:34:42 -05005946 struct rbd_spec *spec = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05005947 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05005948 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005949 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005950
5951 /* The first four tokens are required */
5952
Alex Elder7ef32142012-02-02 08:13:30 -06005953 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005954 if (!len) {
5955 rbd_warn(NULL, "no monitor address(es) provided");
5956 return -EINVAL;
5957 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005958 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005959 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005960 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005961
Alex Elderdc79b112012-10-25 23:34:41 -05005962 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005963 options = dup_token(&buf, NULL);
5964 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005965 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005966 if (!*options) {
5967 rbd_warn(NULL, "no options provided");
5968 goto out_err;
5969 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005970
Alex Elder859c31d2012-10-25 23:34:42 -05005971 spec = rbd_spec_alloc();
5972 if (!spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005973 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005974
5975 spec->pool_name = dup_token(&buf, NULL);
5976 if (!spec->pool_name)
5977 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005978 if (!*spec->pool_name) {
5979 rbd_warn(NULL, "no pool name provided");
5980 goto out_err;
5981 }
Alex Eldere28fff262012-02-02 08:13:30 -06005982
Alex Elder69e7a022012-11-01 08:39:26 -05005983 spec->image_name = dup_token(&buf, NULL);
Alex Elder859c31d2012-10-25 23:34:42 -05005984 if (!spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005985 goto out_mem;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005986 if (!*spec->image_name) {
5987 rbd_warn(NULL, "no image name provided");
5988 goto out_err;
5989 }
Alex Eldere28fff262012-02-02 08:13:30 -06005990
Alex Elderf28e5652012-10-25 23:34:41 -05005991 /*
5992 * Snapshot name is optional; default is to use "-"
5993 * (indicating the head/no snapshot).
5994 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005995 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005996 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005997 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5998 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005999 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05006000 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05006001 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05006002 }
Alex Elderecb4dc22013-04-26 09:43:47 -05006003 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
6004 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05006005 goto out_mem;
Alex Elderecb4dc22013-04-26 09:43:47 -05006006 *(snap_name + len) = '\0';
6007 spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05006008
Alex Elder0ddebc02012-10-25 23:34:41 -05006009 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06006010
Alex Elder4e9afeb2012-10-25 23:34:41 -05006011 rbd_opts = kzalloc(sizeof (*rbd_opts), GFP_KERNEL);
6012 if (!rbd_opts)
6013 goto out_mem;
6014
6015 rbd_opts->read_only = RBD_READ_ONLY_DEFAULT;
Ilya Dryomovb5584182015-06-23 16:21:19 +03006016 rbd_opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov80de1912016-09-20 14:23:17 +02006017 rbd_opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
Ilya Dryomove010dd02017-04-13 12:17:39 +02006018 rbd_opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05006019
Alex Elder859c31d2012-10-25 23:34:42 -05006020 copts = ceph_parse_options(options, mon_addrs,
Alex Elder0ddebc02012-10-25 23:34:41 -05006021 mon_addrs + mon_addrs_size - 1,
Alex Elder4e9afeb2012-10-25 23:34:41 -05006022 parse_rbd_opts_token, rbd_opts);
Alex Elder859c31d2012-10-25 23:34:42 -05006023 if (IS_ERR(copts)) {
6024 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05006025 goto out_err;
6026 }
Alex Elder859c31d2012-10-25 23:34:42 -05006027 kfree(options);
6028
6029 *ceph_opts = copts;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006030 *opts = rbd_opts;
Alex Elder859c31d2012-10-25 23:34:42 -05006031 *rbd_spec = spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05006032
Alex Elderdc79b112012-10-25 23:34:41 -05006033 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05006034out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05006035 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05006036out_err:
Alex Elder859c31d2012-10-25 23:34:42 -05006037 kfree(rbd_opts);
6038 rbd_spec_put(spec);
Alex Elderf28e5652012-10-25 23:34:41 -05006039 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05006040
Alex Elderdc79b112012-10-25 23:34:41 -05006041 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06006042}
6043
Alex Elder589d30e2012-07-10 20:30:11 -05006044/*
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006045 * Return pool id (>= 0) or a negative error code.
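 * If the pool is not in the client's cached osdmap, fetch the newest
 * map epoch from the monitor and retry the lookup once with an
 * updated map.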
6046 */
6047static int rbd_add_get_pool_id(struct rbd_client *rbdc, const char *pool_name)
6048{
Ilya Dryomova319bf52015-05-15 12:02:17 +03006049 struct ceph_options *opts = rbdc->client->options;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006050 u64 newest_epoch;
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006051 int tries = 0;
6052 int ret;
6053
6054again:
6055 ret = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, pool_name);
6056 if (ret == -ENOENT && tries++ < 1) {
Ilya Dryomovd0b19702016-04-28 16:07:27 +02006057 ret = ceph_monc_get_version(&rbdc->client->monc, "osdmap",
6058 &newest_epoch);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006059 if (ret < 0)
6060 return ret;
6061
6062 if (rbdc->client->osdc.osdmap->epoch < newest_epoch) {
Ilya Dryomov7cca78c2016-04-28 16:07:28 +02006063 ceph_osdc_maybe_request_map(&rbdc->client->osdc);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006064 (void) ceph_monc_wait_osdmap(&rbdc->client->monc,
Ilya Dryomova319bf52015-05-15 12:02:17 +03006065 newest_epoch,
6066 opts->mount_timeout);
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006067 goto again;
6068 } else {
6069 /* the osdmap we have is new enough */
6070 return -ENOENT;
6071 }
6072 }
6073
6074 return ret;
6075}
6076
Ilya Dryomove010dd02017-04-13 12:17:39 +02006077static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
6078{
6079 down_write(&rbd_dev->lock_rwsem);
6080 if (__rbd_is_lock_owner(rbd_dev))
6081 rbd_unlock(rbd_dev);
6082 up_write(&rbd_dev->lock_rwsem);
6083}
6084
6085static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
6086{
6087 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
6088 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
6089 return -EINVAL;
6090 }
6091
6092	/* FIXME: "rbd map --exclusive" should be interruptible */
6093 down_read(&rbd_dev->lock_rwsem);
6094 rbd_wait_state_locked(rbd_dev);
6095 up_read(&rbd_dev->lock_rwsem);
6096 if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
6097 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
6098 return -EROFS;
6099 }
6100
6101 return 0;
6102}
6103
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006104/*
Alex Elder589d30e2012-07-10 20:30:11 -05006105 * An rbd format 2 image has a unique identifier, distinct from the
6106 * name given to it by the user. Internally, that identifier is
6107 * what's used to specify the names of objects related to the image.
6108 *
6109 * A special "rbd id" object is used to map an rbd image name to its
6110 * id. If that object doesn't exist, then there is no v2 rbd image
6111 * with the supplied name.
6112 *
6113 * This function will record the given rbd_dev's image_id field if
6114 * it can be determined, and in that case will return 0. If any
6115 * errors occur a negative errno will be returned and the rbd_dev's
6116 * image_id field will be unchanged (and should be NULL).
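 *
 * (The id object's name is RBD_ID_PREFIX followed by the image name,
 * i.e. "rbd_id.<image_name>".)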
6117 */
6118static int rbd_dev_image_id(struct rbd_device *rbd_dev)
6119{
6120 int ret;
6121 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006122 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006123 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05006124 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05006125
Alex Elder589d30e2012-07-10 20:30:11 -05006126 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05006127 * When probing a parent image, the image id is already
6128 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05006129 * need to fetch the image id again in this case. We
6130 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05006131 */
Alex Elderc0fba362013-04-25 23:15:08 -05006132 if (rbd_dev->spec->image_id) {
6133 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
6134
Alex Elder2c0d0a12012-10-30 19:40:33 -05006135 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05006136 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05006137
6138 /*
Alex Elder589d30e2012-07-10 20:30:11 -05006139	 * First, see if the format 2 image id object exists, and if
6140 * so, get the image's persistent id from it.
6141 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006142 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
6143 rbd_dev->spec->image_name);
6144 if (ret)
6145 return ret;
6146
6147 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05006148
6149 /* Response will be an encoded string, which includes a length */
6150
6151 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
6152 response = kzalloc(size, GFP_NOIO);
6153 if (!response) {
6154 ret = -ENOMEM;
6155 goto out;
6156 }
6157
Alex Elderc0fba362013-04-25 23:15:08 -05006158 /* If it doesn't exist we'll assume it's a format 1 image */
6159
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006160 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
6161 "get_id", NULL, 0,
6162 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06006163 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05006164 if (ret == -ENOENT) {
6165 image_id = kstrdup("", GFP_KERNEL);
6166 ret = image_id ? 0 : -ENOMEM;
6167 if (!ret)
6168 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04006169 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05006170 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05006171
Alex Elderc0fba362013-04-25 23:15:08 -05006172 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05006173 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08006174 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05006175 if (!ret)
6176 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05006177 }
6178
6179 if (!ret) {
6180 rbd_dev->spec->image_id = image_id;
6181 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05006182 }
6183out:
6184 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01006185 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05006186 return ret;
6187}
6188
Alex Elder3abef3b2013-05-13 20:35:37 -05006189/*
6190 * Undo whatever state changes were made by a v1 or v2 header info
6191 * call.
6192 */
Alex Elder6fd48b32013-04-28 23:32:34 -05006193static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
6194{
6195 struct rbd_image_header *header;
6196
Ilya Dryomove69b8d42015-01-19 12:06:14 +03006197 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006198
6199 /* Free dynamic fields from the header, then zero it out */
6200
6201 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05006202 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05006203 kfree(header->snap_sizes);
6204 kfree(header->snap_names);
6205 kfree(header->object_prefix);
6206 memset(header, 0, sizeof (*header));
6207}
6208
Alex Elder2df3fac2013-05-06 09:51:30 -05006209static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05006210{
6211 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006212
Alex Elder1e130192012-07-03 16:01:19 -05006213 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05006214 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05006215 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05006216
Alex Elder2df3fac2013-05-06 09:51:30 -05006217 /*
6218	 * Get and check the features for the image.  Currently the
6219 * features are assumed to never change.
6220 */
Alex Elderb1b54022012-07-03 16:01:19 -05006221 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05006222 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05006223 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05006224
Alex Eldercc070d52013-04-21 12:14:45 -05006225 /* If the image supports fancy striping, get its parameters */
6226
6227 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
6228 ret = rbd_dev_v2_striping_info(rbd_dev);
6229 if (ret < 0)
6230 goto out_err;
6231 }
Alex Eldera30b71b2012-07-10 20:30:11 -05006232
Ilya Dryomov7e973322017-01-25 18:16:22 +01006233 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
6234 ret = rbd_dev_v2_data_pool(rbd_dev);
6235 if (ret)
6236 goto out_err;
6237 }
6238
Ilya Dryomov263423f2017-01-25 18:16:22 +01006239 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05006240 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01006241
Alex Elder9d475de2012-07-03 16:01:19 -05006242out_err:
Alex Elder642a2532013-05-06 17:40:33 -05006243 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05006244 kfree(rbd_dev->header.object_prefix);
6245 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05006246 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006247}
6248
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006249/*
6250 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
6251 * rbd_dev_image_probe() recursion depth, which means it's also the
6252 * length of the already discovered part of the parent chain.
6253 */
6254static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05006255{
Alex Elder2f82ee52012-10-30 19:40:33 -05006256 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05006257 int ret;
6258
6259 if (!rbd_dev->parent_spec)
6260 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05006261
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006262 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
6263 pr_info("parent chain is too long (%d)\n", depth);
6264 ret = -EINVAL;
6265 goto out_err;
6266 }
6267
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006268 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006269 if (!parent) {
6270 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05006271 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006272 }
6273
6274 /*
6275 * Images related by parent/child relationships always share
6276 * rbd_client and spec/parent_spec, so bump their refcounts.
6277 */
6278 __rbd_get_client(rbd_dev->rbd_client);
6279 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05006280
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006281 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05006282 if (ret < 0)
6283 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006284
Alex Elder124afba2013-04-26 15:44:36 -05006285 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05006286 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05006287 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05006288
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02006289out_err:
6290 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01006291 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05006292 return ret;
6293}
6294
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006295static void rbd_dev_device_release(struct rbd_device *rbd_dev)
6296{
6297 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
6298 rbd_dev_mapping_clear(rbd_dev);
6299 rbd_free_disk(rbd_dev);
6300 if (!single_major)
6301 unregister_blkdev(rbd_dev->major, rbd_dev->name);
6302}
6303
Ilya Dryomov811c6682016-04-15 16:22:16 +02006304/*
6305 * rbd_dev->header_rwsem must be locked for write and will be unlocked
6306 * upon return.
6307 */
Alex Elder200a6a82013-04-28 23:32:34 -05006308static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05006309{
Alex Elder83a06262012-10-30 15:47:17 -05006310 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05006311
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006312 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05006313
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006314 if (!single_major) {
6315 ret = register_blkdev(0, rbd_dev->name);
6316 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02006317 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006318
6319 rbd_dev->major = ret;
6320 rbd_dev->minor = 0;
6321 } else {
6322 rbd_dev->major = rbd_major;
6323 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
6324 }
Alex Elder83a06262012-10-30 15:47:17 -05006325
6326 /* Set up the blkdev mapping. */
6327
6328 ret = rbd_init_disk(rbd_dev);
6329 if (ret)
6330 goto err_out_blkdev;
6331
Alex Elderf35a4de2013-05-06 09:51:29 -05006332 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05006333 if (ret)
6334 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04006335
Alex Elderf35a4de2013-05-06 09:51:29 -05006336 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02006337 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05006338
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006339 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05006340 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006341 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05006342
Alex Elder129b79d2013-04-26 15:44:36 -05006343 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006344 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006345 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05006346
Alex Elderf35a4de2013-05-06 09:51:29 -05006347err_out_mapping:
6348 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05006349err_out_disk:
6350 rbd_free_disk(rbd_dev);
6351err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006352 if (!single_major)
6353 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02006354err_out_unlock:
6355 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05006356 return ret;
6357}
6358
Alex Elder332bb122013-04-27 09:59:30 -05006359static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6360{
6361 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006362 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05006363
6364 /* Record the header object name for this rbd image. */
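	/*
	 * With the prefixes from rbd_types.h this yields
	 * "<image_name>.rbd" for format 1 and "rbd_header.<image_id>"
	 * for format 2.
	 */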
6365
6366 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05006367 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006368 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6369 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05006370 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006371 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6372 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006373
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006374 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006375}
6376
Alex Elder200a6a82013-04-28 23:32:34 -05006377static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6378{
Alex Elder6fd48b32013-04-28 23:32:34 -05006379 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006380 if (rbd_dev->opts)
6381 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006382 rbd_dev->image_format = 0;
6383 kfree(rbd_dev->spec->image_id);
6384 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05006385}
6386
Alex Eldera30b71b2012-07-10 20:30:11 -05006387/*
6388 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006389 * device. If this image is the one being mapped (i.e., not a
6390 * parent), initiate a watch on its header object before using that
6391 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006392 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006393static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006394{
6395 int ret;
6396
6397 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006398 * Get the id from the image id object. Unless there's an
6399 * error, rbd_dev->spec->image_id will be filled in with
6400 * a dynamically-allocated string, and rbd_dev->image_format
6401 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006402 */
6403 ret = rbd_dev_image_id(rbd_dev);
6404 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006405 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006406
Alex Elder332bb122013-04-27 09:59:30 -05006407 ret = rbd_dev_header_name(rbd_dev);
6408 if (ret)
6409 goto err_out_format;
6410
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006411 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006412 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006413 if (ret) {
6414 if (ret == -ENOENT)
6415 pr_info("image %s/%s does not exist\n",
6416 rbd_dev->spec->pool_name,
6417 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006418 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006419 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006420 }
Alex Elderb644de22013-04-27 09:59:31 -05006421
Ilya Dryomova720ae02014-07-23 17:11:19 +04006422 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006423 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006424 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006425
Ilya Dryomov04077592014-07-23 17:11:20 +04006426 /*
6427 * If this image is the one being mapped, we have pool name and
6428 * id, image name and id, and snap name - need to fill snap id.
6429 * Otherwise this is a parent image, identified by pool, image
6430 * and snap ids - need to fill in names for those ids.
6431 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006432 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006433 ret = rbd_spec_fill_snap_id(rbd_dev);
6434 else
6435 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006436 if (ret) {
6437 if (ret == -ENOENT)
6438 pr_info("snap %s/%s@%s does not exist\n",
6439 rbd_dev->spec->pool_name,
6440 rbd_dev->spec->image_name,
6441 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006442 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006443 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006444
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006445 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6446 ret = rbd_dev_v2_parent_info(rbd_dev);
6447 if (ret)
6448 goto err_out_probe;
6449
6450 /*
6451 * Need to warn users if this image is the one being
6452 * mapped and has a parent.
6453 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006454 if (!depth && rbd_dev->parent_spec)
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006455 rbd_warn(rbd_dev,
6456 "WARNING: kernel layering is EXPERIMENTAL!");
6457 }
6458
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006459 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006460 if (ret)
6461 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006462
Alex Elder30d60ba2013-05-06 09:51:30 -05006463 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006464 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006465 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006466
Alex Elder6fd48b32013-04-28 23:32:34 -05006467err_out_probe:
6468 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006469err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006470 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006471 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006472err_out_format:
6473 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006474 kfree(rbd_dev->spec->image_id);
6475 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006476 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006477}
6478
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006479static ssize_t do_rbd_add(struct bus_type *bus,
6480 const char *buf,
6481 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006482{
Alex Eldercb8627c2012-07-09 21:04:23 -05006483 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006484 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006485 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006486 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006487 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006488 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006489
6490 if (!try_module_get(THIS_MODULE))
6491 return -ENODEV;
6492
Alex Eldera725f65e2012-02-02 08:13:30 -06006493 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006494 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006495 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006496 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006497
Alex Elder9d3997f2012-10-25 23:34:42 -05006498 rbdc = rbd_get_client(ceph_opts);
6499 if (IS_ERR(rbdc)) {
6500 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006501 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006502 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006503
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006504 /* pick the pool */
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04006505 rc = rbd_add_get_pool_id(rbdc, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006506 if (rc < 0) {
6507 if (rc == -ENOENT)
6508 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006509 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006510 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006511 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006512
Ilya Dryomovd1475432015-06-22 13:24:48 +03006513 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006514 if (!rbd_dev) {
6515 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006516 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006517 }
Alex Elderc53d5892012-10-25 23:34:42 -05006518 rbdc = NULL; /* rbd_dev now owns this */
6519 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006520 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006521
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006522 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6523 if (!rbd_dev->config_info) {
6524 rc = -ENOMEM;
6525 goto err_out_rbd_dev;
6526 }
6527
Ilya Dryomov811c6682016-04-15 16:22:16 +02006528 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006529 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006530 if (rc < 0) {
6531 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006532 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006533 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006534
Alex Elder7ce4eef2013-05-06 17:40:33 -05006535 /* If we are mapping a snapshot it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05006536 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02006537 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006538
Alex Elderb536f692013-04-28 23:32:34 -05006539 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006540 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006541 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05006542
Ilya Dryomove010dd02017-04-13 12:17:39 +02006543 if (rbd_dev->opts->exclusive) {
6544 rc = rbd_add_acquire_lock(rbd_dev);
6545 if (rc)
6546 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05006547 }
6548
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006549 /* Everything's ready. Announce the disk to the world. */
6550
6551 rc = device_add(&rbd_dev->dev);
6552 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02006553 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006554
6555 add_disk(rbd_dev->disk);
6556 /* see rbd_init_disk() */
6557 blk_put_queue(rbd_dev->disk->queue);
6558
6559 spin_lock(&rbd_dev_list_lock);
6560 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6561 spin_unlock(&rbd_dev_list_lock);
6562
6563 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6564 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6565 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006566 rc = count;
6567out:
6568 module_put(THIS_MODULE);
6569 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006570
Ilya Dryomove010dd02017-04-13 12:17:39 +02006571err_out_image_lock:
6572 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006573err_out_device_setup:
6574 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006575err_out_image_probe:
6576 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006577err_out_rbd_dev:
6578 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006579err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006580 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006581err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006582 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006583 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006584 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006585}

static ssize_t rbd_add(struct bus_type *bus,
		       const char *buf,
		       size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_add(bus, buf, count);
}

static ssize_t rbd_add_single_major(struct bus_type *bus,
				    const char *buf,
				    size_t count)
{
	return do_rbd_add(bus, buf, count);
}

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
{
	while (rbd_dev->parent) {
		struct rbd_device *first = rbd_dev;
		struct rbd_device *second = first->parent;
		struct rbd_device *third;

		/*
		 * Walk down to the parent with no grandparent (the
		 * far end of the chain) and remove it.
		 */
		while (second && (third = second->parent)) {
			first = second;
			second = third;
		}
		rbd_assert(second);
		rbd_dev_image_release(second);
		rbd_dev_destroy(second);
		first->parent = NULL;
		first->parent_overlap = 0;

		rbd_assert(first->parent_spec);
		rbd_spec_put(first->parent_spec);
		first->parent_spec = NULL;
	}
}
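
/*
 * Orientation note: a layered mapping forms a chain rbd_dev -> parent
 * -> grandparent -> ...; each pass of the outer loop above releases
 * the device at the far end of that chain and works back toward
 * rbd_dev, one ancestor per iteration.
 */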

static ssize_t do_rbd_remove(struct bus_type *bus,
			     const char *buf,
			     size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool already = false;
	bool force = false;
	int ret;

	dev_id = -1;
	opt_buf[0] = '\0';
	sscanf(buf, "%d %5s", &dev_id, opt_buf);
	if (dev_id < 0) {
		pr_err("dev_id out of range\n");
		return -EINVAL;
	}
	if (opt_buf[0] != '\0') {
		if (!strcmp(opt_buf, "force")) {
			force = true;
		} else {
			pr_err("bad remove option at '%s'\n", opt_buf);
			return -EINVAL;
		}
	}

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
		}
	}
	if (!ret) {
		spin_lock_irq(&rbd_dev->lock);
		if (rbd_dev->open_count && !force)
			ret = -EBUSY;
		else
			already = test_and_set_bit(RBD_DEV_FLAG_REMOVING,
						   &rbd_dev->flags);
		spin_unlock_irq(&rbd_dev->lock);
	}
	spin_unlock(&rbd_dev_list_lock);
	if (ret < 0 || already)
		return ret;

	if (force) {
		/*
		 * Prevent new IO from being queued and wait for existing
		 * IO to complete/fail.
		 */
		blk_mq_freeze_queue(rbd_dev->disk->queue);
		blk_set_queue_dying(rbd_dev->disk->queue);
	}

	del_gendisk(rbd_dev->disk);
	spin_lock(&rbd_dev_list_lock);
	list_del_init(&rbd_dev->node);
	spin_unlock(&rbd_dev_list_lock);
	device_del(&rbd_dev->dev);

	rbd_dev_image_unlock(rbd_dev);
	rbd_dev_device_release(rbd_dev);
	rbd_dev_image_release(rbd_dev);
	rbd_dev_destroy(rbd_dev);
	return count;
}
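
/*
 * Illustrative usage (device id 0 is a placeholder; the attribute is
 * remove_single_major in single_major mode):
 *
 *   $ echo 0 > /sys/bus/rbd/remove          # -EBUSY while still open
 *   $ echo "0 force" > /sys/bus/rbd/remove  # fail in-flight I/O and
 *                                           # remove the mapping anyway
 */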

static ssize_t rbd_remove(struct bus_type *bus,
			  const char *buf,
			  size_t count)
{
	if (single_major)
		return -EINVAL;

	return do_rbd_remove(bus, buf, count);
}

static ssize_t rbd_remove_single_major(struct bus_type *bus,
				       const char *buf,
				       size_t count)
{
	return do_rbd_remove(bus, buf, count);
}

/*
 * Create control files in sysfs:
 * /sys/bus/rbd/...
 */
static int rbd_sysfs_init(void)
{
	int ret;

	ret = device_register(&rbd_root_dev);
	if (ret < 0)
		return ret;

	ret = bus_register(&rbd_bus_type);
	if (ret < 0)
		device_unregister(&rbd_root_dev);

	return ret;
}

static void rbd_sysfs_cleanup(void)
{
	bus_unregister(&rbd_bus_type);
	device_unregister(&rbd_root_dev);
}

static int rbd_slab_init(void)
{
	rbd_assert(!rbd_img_request_cache);
	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
	if (!rbd_img_request_cache)
		return -ENOMEM;

	rbd_assert(!rbd_obj_request_cache);
	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
	if (!rbd_obj_request_cache)
		goto out_err;

	return 0;

out_err:
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
	return -ENOMEM;
}
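
/*
 * Note: KMEM_CACHE() names the cache after the struct and takes its
 * size and alignment from the struct definition, so the two caches
 * above are exact-fit pools for struct rbd_img_request and
 * struct rbd_obj_request.
 */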

static void rbd_slab_exit(void)
{
	rbd_assert(rbd_obj_request_cache);
	kmem_cache_destroy(rbd_obj_request_cache);
	rbd_obj_request_cache = NULL;

	rbd_assert(rbd_img_request_cache);
	kmem_cache_destroy(rbd_img_request_cache);
	rbd_img_request_cache = NULL;
}

static int __init rbd_init(void)
{
	int rc;

	if (!libceph_compatible(NULL)) {
		rbd_warn(NULL, "libceph incompatibility (quitting)");
		return -EINVAL;
	}

	rc = rbd_slab_init();
	if (rc)
		return rc;

	/*
	 * The number of active work items is limited by the number of
	 * rbd devices * queue depth, so leave @max_active at default.
	 */
	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
	if (!rbd_wq) {
		rc = -ENOMEM;
		goto err_out_slab;
	}

	if (single_major) {
		rbd_major = register_blkdev(0, RBD_DRV_NAME);
		if (rbd_major < 0) {
			rc = rbd_major;
			goto err_out_wq;
		}
	}

	rc = rbd_sysfs_init();
	if (rc)
		goto err_out_blkdev;

	if (single_major)
		pr_info("loaded (major %d)\n", rbd_major);
	else
		pr_info("loaded\n");

	return 0;

err_out_blkdev:
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
err_out_wq:
	destroy_workqueue(rbd_wq);
err_out_slab:
	rbd_slab_exit();
	return rc;
}
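
/*
 * Note on single_major mode (orientation comment): register_blkdev(0,
 * ...) above requests one dynamically allocated major that every
 * mapping then shares, with per-device minor ranges assigned elsewhere
 * in this file; without single_major, each device registers its own
 * major at map time instead.
 */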

static void __exit rbd_exit(void)
{
	ida_destroy(&rbd_dev_id_ida);
	rbd_sysfs_cleanup();
	if (single_major)
		unregister_blkdev(rbd_major, RBD_DRV_NAME);
	destroy_workqueue(rbd_wq);
	rbd_slab_exit();
}

module_init(rbd_init);
module_exit(rbd_exit);
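
/*
 * Illustrative load/unload sequence (the parameter name is taken from
 * the single_major checks above; its default varies by kernel version):
 *
 *   $ modprobe rbd single_major=Y
 *   $ rmmod rbd
 */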

MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
/* following authorship retained from original osdblk.c */
MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");

MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
MODULE_LICENSE("GPL");