
/*
   rbd.c -- Export ceph rados objects as a Linux block device


   based on drivers/block/osdblk.c:

   Copyright 2009 Red Hat, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.



   For usage instructions, please refer to:

		 Documentation/ABI/testing/sysfs-bus-rbd

 */

#include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/mon_client.h>
#include <linux/ceph/cls_lock_client.h>
#include <linux/ceph/striper.h>
#include <linux/ceph/decode.h>
#include <linux/parser.h>
#include <linux/bsearch.h>

#include <linux/kernel.h>
#include <linux/device.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/workqueue.h>

#include "rbd_types.h"

#define RBD_DEBUG	/* Activate rbd_assert() calls */

/*
 * Increment the given counter and return its updated value.
 * If the counter is already 0 it will not be incremented.
 * If the counter is already at its maximum value, returns
 * -EINVAL without updating it.
 */
static int atomic_inc_return_safe(atomic_t *v)
{
	unsigned int counter;

	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
	if (counter <= (unsigned int)INT_MAX)
		return (int)counter;

	atomic_dec(v);

	return -EINVAL;
}

/* Decrement the counter.  Return the resulting value, or -EINVAL */
static int atomic_dec_return_safe(atomic_t *v)
{
	int counter;

	counter = atomic_dec_return(v);
	if (counter >= 0)
		return counter;

	atomic_inc(v);

	return -EINVAL;
}
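
/*
 * These "safe" atomics back rbd_dev->parent_ref below: zero is a
 * terminal value there (no usable parent image), so an increment must
 * never resurrect a counter that has already dropped to zero.
 */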

#define RBD_DRV_NAME "rbd"

#define RBD_MINORS_PER_MAJOR		256
#define RBD_SINGLE_MAJOR_PART_SHIFT	4

#define RBD_MAX_PARENT_CHAIN_LEN	16

#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
#define RBD_MAX_SNAP_NAME_LEN	\
			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))

#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */

#define RBD_SNAP_HEAD_NAME	"-"

#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */

/* This allows a single page to hold an image name sent by OSD */
#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
#define RBD_IMAGE_ID_LEN_MAX	64

#define RBD_OBJ_PREFIX_LEN_MAX	64

#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)

/* Feature bits */

#define RBD_FEATURE_LAYERING		(1ULL<<0)
#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
#define RBD_FEATURE_OPERATIONS		(1ULL<<8)

#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
				 RBD_FEATURE_STRIPINGV2 |	\
				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
				 RBD_FEATURE_DEEP_FLATTEN |	\
				 RBD_FEATURE_DATA_POOL |	\
				 RBD_FEATURE_OPERATIONS)

/* Features supported by this (client software) implementation. */

#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
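
/*
 * An image advertising any feature bit outside RBD_FEATURES_SUPPORTED
 * cannot be mapped; the image probe code later in this file refuses it.
 */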

/*
 * An RBD device name will be "rbd#", where the "rbd" comes from
 * RBD_DRV_NAME above, and # is a unique integer identifier.
 */
#define DEV_NAME_LEN		32

/*
 * block device image metadata (in-memory version)
 */
struct rbd_image_header {
	/* These six fields never change for a given rbd image */
	char *object_prefix;
	__u8 obj_order;
	u64 stripe_unit;
	u64 stripe_count;
	s64 data_pool_id;
	u64 features;		/* Might be changeable someday? */

	/* The remaining fields need to be updated occasionally */
	u64 image_size;
	struct ceph_snap_context *snapc;
	char *snap_names;	/* format 1 only */
	u64 *snap_sizes;	/* format 1 only */
};

/*
 * An rbd image specification.
 *
 * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
 * identify an image.  Each rbd_dev structure includes a pointer to
 * an rbd_spec structure that encapsulates this identity.
 *
 * Each of the id's in an rbd_spec has an associated name.  For a
 * user-mapped image, the names are supplied and the id's associated
 * with them are looked up.  For a layered image, a parent image is
 * defined by the tuple, and the names are looked up.
 *
 * An rbd_dev structure contains a parent_spec pointer which is
 * non-null if the image it represents is a child in a layered
 * image.  This pointer will refer to the rbd_spec structure used
 * by the parent rbd_dev for its own identity (i.e., the structure
 * is shared between the parent and child).
 *
 * Since these structures are populated once, during the discovery
 * phase of image construction, they are effectively immutable so
 * we make no effort to synchronize access to them.
 *
 * Note that code herein does not assume the image name is known (it
 * could be a null pointer).
 */
struct rbd_spec {
	u64		pool_id;
	const char	*pool_name;
	const char	*pool_ns;	/* NULL if default, never "" */

	const char	*image_id;
	const char	*image_name;

	u64		snap_id;
	const char	*snap_name;

	struct kref	kref;
};

/*
 * an instance of the client.  multiple devices may share an rbd client.
 */
struct rbd_client {
	struct ceph_client	*client;
	struct kref		kref;
	struct list_head	node;
};

struct pending_result {
	int			result;		/* first nonzero result */
	int			num_pending;
};

struct rbd_img_request;

enum obj_request_type {
	OBJ_REQUEST_NODATA = 1,
	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
};

enum obj_operation_type {
	OBJ_OP_READ = 1,
	OBJ_OP_WRITE,
	OBJ_OP_DISCARD,
	OBJ_OP_ZEROOUT,
};

#define RBD_OBJ_FLAG_DELETION			(1U << 0)
#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)

enum rbd_obj_read_state {
	RBD_OBJ_READ_START = 1,
	RBD_OBJ_READ_OBJECT,
	RBD_OBJ_READ_PARENT,
};

/*
 * Writes go through the following state machine to deal with
 * layering:
 *
 *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
 *            .                 |                                    .
 *            .                 v                                    .
 *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
 *            .                 |                    .               .
 *            .                 v                    v (deep-copyup  .
 *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
 * flattened) v                 |                    .               .
 *            .                 v                    .               .
 *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .    (copyup    .
 *                              |                       not needed)  v
 *                              v                                    .
 *                             done . . . . . . . . . . . . . . . . .
 *                              ^
 *                              |
 *                     RBD_OBJ_WRITE_FLAT
 *
 * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
 * assert_exists guard is needed or not (in some cases it's not needed
 * even if there is a parent).
 */
enum rbd_obj_write_state {
	RBD_OBJ_WRITE_START = 1,
	RBD_OBJ_WRITE_OBJECT,
	RBD_OBJ_WRITE_READ_FROM_PARENT,
	RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC,
	RBD_OBJ_WRITE_COPYUP_OPS,
};

struct rbd_obj_request {
	struct ceph_object_extent ex;
	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
	union {
		enum rbd_obj_read_state	 read_state;	/* for reads */
		enum rbd_obj_write_state write_state;	/* for writes */
	};

	struct rbd_img_request	*img_request;
	struct ceph_file_extent	*img_extents;
	u32			num_img_extents;

	union {
		struct ceph_bio_iter	bio_pos;
		struct {
			struct ceph_bvec_iter	bvec_pos;
			u32			bvec_count;
			u32			bvec_idx;
		};
	};
	struct bio_vec		*copyup_bvecs;
	u32			copyup_bvec_count;

	struct list_head	osd_reqs;	/* w/ r_private_item */

	struct mutex		state_mutex;
	struct kref		kref;
};

enum img_req_flags {
	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
};

enum rbd_img_state {
	RBD_IMG_START = 1,
	__RBD_IMG_OBJECT_REQUESTS,
	RBD_IMG_OBJECT_REQUESTS,
};

struct rbd_img_request {
	struct rbd_device	*rbd_dev;
	enum obj_operation_type	op_type;
	enum obj_request_type	data_type;
	unsigned long		flags;
	enum rbd_img_state	state;
	union {
		u64			snap_id;	/* for reads */
		struct ceph_snap_context *snapc;	/* for writes */
	};
	union {
		struct request		*rq;		/* block request */
		struct rbd_obj_request	*obj_request;	/* obj req initiator */
	};

	struct list_head	object_extents;	/* obj_req.ex structs */

	struct mutex		state_mutex;
	struct pending_result	pending;
	struct work_struct	work;
	int			work_result;
	struct kref		kref;
};

#define for_each_obj_request(ireq, oreq) \
	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
#define for_each_obj_request_safe(ireq, oreq, n) \
	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)

enum rbd_watch_state {
	RBD_WATCH_STATE_UNREGISTERED,
	RBD_WATCH_STATE_REGISTERED,
	RBD_WATCH_STATE_ERROR,
};

enum rbd_lock_state {
	RBD_LOCK_STATE_UNLOCKED,
	RBD_LOCK_STATE_LOCKED,
	RBD_LOCK_STATE_RELEASING,
};

/* WatchNotify::ClientId */
struct rbd_client_id {
	u64 gid;
	u64 handle;
};

struct rbd_mapping {
	u64			size;
	u64			features;
};
/*
 * a single device
 */
struct rbd_device {
	int			dev_id;		/* blkdev unique id */

	int			major;		/* blkdev assigned major */
	int			minor;
	struct gendisk		*disk;		/* blkdev's gendisk and rq */

	u32			image_format;	/* Either 1 or 2 */
	struct rbd_client	*rbd_client;

	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */

	spinlock_t		lock;		/* queue, flags, open_count */

	struct rbd_image_header	header;
	unsigned long		flags;		/* possibly lock protected */
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
	char			*config_info;	/* add{,_single_major} string */

	struct ceph_object_id	header_oid;
	struct ceph_object_locator header_oloc;

	struct ceph_file_layout	layout;		/* used for all rbd requests */

	struct mutex		watch_mutex;
	enum rbd_watch_state	watch_state;
	struct ceph_osd_linger_request *watch_handle;
	u64			watch_cookie;
	struct delayed_work	watch_dwork;

	struct rw_semaphore	lock_rwsem;
	enum rbd_lock_state	lock_state;
	char			lock_cookie[32];
	struct rbd_client_id	owner_cid;
	struct work_struct	acquired_lock_work;
	struct work_struct	released_lock_work;
	struct delayed_work	lock_dwork;
	struct work_struct	unlock_work;
	wait_queue_head_t	lock_waitq;

	struct workqueue_struct	*task_wq;

	struct rbd_spec		*parent_spec;
	u64			parent_overlap;
	atomic_t		parent_ref;
	struct rbd_device	*parent;

	/* Block layer tags. */
	struct blk_mq_tag_set	tag_set;

	/* protects updating the header */
	struct rw_semaphore	header_rwsem;

	struct rbd_mapping	mapping;

	struct list_head	node;

	/* sysfs related */
	struct device		dev;
	unsigned long		open_count;	/* protected by lock */
};

/*
 * Flag bits for rbd_dev->flags:
 * - REMOVING (which is coupled with rbd_dev->open_count) is protected
 *   by rbd_dev->lock
 * - BLACKLISTED is protected by rbd_dev->lock_rwsem
 */
enum rbd_dev_flags {
	RBD_DEV_FLAG_EXISTS,	/* mapped snapshot has not been deleted */
	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
	RBD_DEV_FLAG_BLACKLISTED, /* our ceph_client is blacklisted */
};

static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */

static LIST_HEAD(rbd_dev_list);    /* devices */
static DEFINE_SPINLOCK(rbd_dev_list_lock);

static LIST_HEAD(rbd_client_list);		/* clients */
static DEFINE_SPINLOCK(rbd_client_list_lock);

/* Slab caches for frequently-allocated structures */

static struct kmem_cache	*rbd_img_request_cache;
static struct kmem_cache	*rbd_obj_request_cache;

static int rbd_major;
static DEFINE_IDA(rbd_dev_id_ida);

static struct workqueue_struct *rbd_wq;

static struct ceph_snap_context rbd_empty_snapc = {
	.nref = REFCOUNT_INIT(1),
};

/*
 * single-major requires >= 0.75 version of userspace rbd utility.
 */
static bool single_major = true;
module_param(single_major, bool, 0444);
MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");

static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
static ssize_t remove_store(struct bus_type *bus, const char *buf,
			    size_t count);
static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
				      size_t count);
static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
					 size_t count);
static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);

static int rbd_dev_id_to_minor(int dev_id)
{
	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
}

static int minor_to_rbd_dev_id(int minor)
{
	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
}
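
/*
 * With RBD_SINGLE_MAJOR_PART_SHIFT == 4 each device owns 16 minors,
 * e.g. dev_id 2 gets minors 32..47: the whole device rbd2 plus up to
 * 15 partitions.
 */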

static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
}

static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
{
	bool is_lock_owner;

	down_read(&rbd_dev->lock_rwsem);
	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
	up_read(&rbd_dev->lock_rwsem);
	return is_lock_owner;
}

static ssize_t supported_features_show(struct bus_type *bus, char *buf)
{
	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
}

static BUS_ATTR_WO(add);
static BUS_ATTR_WO(remove);
static BUS_ATTR_WO(add_single_major);
static BUS_ATTR_WO(remove_single_major);
static BUS_ATTR_RO(supported_features);

static struct attribute *rbd_bus_attrs[] = {
	&bus_attr_add.attr,
	&bus_attr_remove.attr,
	&bus_attr_add_single_major.attr,
	&bus_attr_remove_single_major.attr,
	&bus_attr_supported_features.attr,
	NULL,
};

static umode_t rbd_bus_is_visible(struct kobject *kobj,
				  struct attribute *attr, int index)
{
	if (!single_major &&
	    (attr == &bus_attr_add_single_major.attr ||
	     attr == &bus_attr_remove_single_major.attr))
		return 0;

	return attr->mode;
}

static const struct attribute_group rbd_bus_group = {
	.attrs = rbd_bus_attrs,
	.is_visible = rbd_bus_is_visible,
};
__ATTRIBUTE_GROUPS(rbd_bus);

static struct bus_type rbd_bus_type = {
	.name		= "rbd",
	.bus_groups	= rbd_bus_groups,
};

static void rbd_root_dev_release(struct device *dev)
{
}

static struct device rbd_root_dev = {
	.init_name =    "rbd",
	.release =      rbd_root_dev_release,
};

static __printf(2, 3)
void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
{
	struct va_format vaf;
	va_list args;

	va_start(args, fmt);
	vaf.fmt = fmt;
	vaf.va = &args;

	if (!rbd_dev)
		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
	else if (rbd_dev->disk)
		printk(KERN_WARNING "%s: %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_name)
		printk(KERN_WARNING "%s: image %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
	else if (rbd_dev->spec && rbd_dev->spec->image_id)
		printk(KERN_WARNING "%s: id %s: %pV\n",
			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
	else	/* punt */
		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
			RBD_DRV_NAME, rbd_dev, &vaf);
	va_end(args);
}

#ifdef RBD_DEBUG
#define rbd_assert(expr)						\
		if (unlikely(!(expr))) {				\
			printk(KERN_ERR "\nAssertion failure in %s() "	\
						"at line %d:\n\n"	\
					"\trbd_assert(%s);\n\n",	\
					__func__, __LINE__, #expr);	\
			BUG();						\
		}
#else /* !RBD_DEBUG */
#  define rbd_assert(expr)	((void) 0)
#endif /* !RBD_DEBUG */

static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);

static int rbd_dev_refresh(struct rbd_device *rbd_dev);
static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev);
static int rbd_dev_header_info(struct rbd_device *rbd_dev);
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev);
static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
					u64 snap_id);
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				u8 *order, u64 *snap_size);
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				u64 *snap_features);

static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);

/*
 * Return true if nothing else is pending.
 */
static bool pending_result_dec(struct pending_result *pending, int *result)
{
	rbd_assert(pending->num_pending > 0);

	if (*result && !pending->result)
		pending->result = *result;
	if (--pending->num_pending)
		return false;

	*result = pending->result;
	return true;
}
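
/*
 * The image and object request state machines bump num_pending once
 * per outstanding sub-request; completions feed their status in here,
 * and the first nonzero result sticks.
 */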

static int rbd_open(struct block_device *bdev, fmode_t mode)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	bool removing = false;

	spin_lock_irq(&rbd_dev->lock);
	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
		removing = true;
	else
		rbd_dev->open_count++;
	spin_unlock_irq(&rbd_dev->lock);
	if (removing)
		return -ENOENT;

	(void) get_device(&rbd_dev->dev);

	return 0;
}
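
/*
 * Opens are refused once removal of the mapping has begun (REMOVING
 * set); otherwise open_count pins the device, and the get_device()
 * reference taken above is dropped in rbd_release() below.
 */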

static void rbd_release(struct gendisk *disk, fmode_t mode)
{
	struct rbd_device *rbd_dev = disk->private_data;
	unsigned long open_count_before;

	spin_lock_irq(&rbd_dev->lock);
	open_count_before = rbd_dev->open_count--;
	spin_unlock_irq(&rbd_dev->lock);
	rbd_assert(open_count_before > 0);

	put_device(&rbd_dev->dev);
}

static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg)
{
	int ro;

	if (get_user(ro, (int __user *)arg))
		return -EFAULT;

	/* Snapshots can't be marked read-write */
	if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro)
		return -EROFS;

	/* Let blkdev_roset() handle it */
	return -ENOTTY;
}

static int rbd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
	int ret;

	switch (cmd) {
	case BLKROSET:
		ret = rbd_ioctl_set_ro(rbd_dev, arg);
		break;
	default:
		ret = -ENOTTY;
	}

	return ret;
}

#ifdef CONFIG_COMPAT
static int rbd_compat_ioctl(struct block_device *bdev, fmode_t mode,
				unsigned int cmd, unsigned long arg)
{
	return rbd_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static const struct block_device_operations rbd_bd_ops = {
	.owner			= THIS_MODULE,
	.open			= rbd_open,
	.release		= rbd_release,
	.ioctl			= rbd_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= rbd_compat_ioctl,
#endif
};

/*
 * Initialize an rbd client instance.  Success or not, this function
 * consumes ceph_opts.  Caller holds client_mutex.
 */
static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret = -ENOMEM;

	dout("%s:\n", __func__);
	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
	if (!rbdc)
		goto out_opt;

	kref_init(&rbdc->kref);
	INIT_LIST_HEAD(&rbdc->node);

	rbdc->client = ceph_create_client(ceph_opts, rbdc);
	if (IS_ERR(rbdc->client))
		goto out_rbdc;
	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */

	ret = ceph_open_session(rbdc->client);
	if (ret < 0)
		goto out_client;

	spin_lock(&rbd_client_list_lock);
	list_add_tail(&rbdc->node, &rbd_client_list);
	spin_unlock(&rbd_client_list_lock);

	dout("%s: rbdc %p\n", __func__, rbdc);

	return rbdc;
out_client:
	ceph_destroy_client(rbdc->client);
out_rbdc:
	kfree(rbdc);
out_opt:
	if (ceph_opts)
		ceph_destroy_options(ceph_opts);
	dout("%s: error %d\n", __func__, ret);

	return ERR_PTR(ret);
}

static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
{
	kref_get(&rbdc->kref);

	return rbdc;
}

/*
 * Find a ceph client with specific addr and configuration.  If
 * found, bump its reference count.
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);

			found = true;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
}
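
/*
 * Consequently, maps that use identical monitor addresses and options
 * share a single ceph_client (one session with the cluster), unless
 * the libceph "noshare" option (CEPH_OPT_NOSHARE) was given.
 */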

/*
 * (Per device) rbd map options
 */
enum {
	Opt_queue_depth,
	Opt_alloc_size,
	Opt_lock_timeout,
	Opt_last_int,
	/* int args above */
	Opt_pool_ns,
	Opt_last_string,
	/* string args above */
	Opt_read_only,
	Opt_read_write,
	Opt_lock_on_read,
	Opt_exclusive,
	Opt_notrim,
	Opt_err
};

static match_table_t rbd_opts_tokens = {
	{Opt_queue_depth, "queue_depth=%d"},
	{Opt_alloc_size, "alloc_size=%d"},
	{Opt_lock_timeout, "lock_timeout=%d"},
	/* int args above */
	{Opt_pool_ns, "_pool_ns=%s"},
	/* string args above */
	{Opt_read_only, "read_only"},
	{Opt_read_only, "ro"},		/* Alternate spelling */
	{Opt_read_write, "read_write"},
	{Opt_read_write, "rw"},		/* Alternate spelling */
	{Opt_lock_on_read, "lock_on_read"},
	{Opt_exclusive, "exclusive"},
	{Opt_notrim, "notrim"},
	{Opt_err, NULL}
};

struct rbd_options {
	int	queue_depth;
	int	alloc_size;
	unsigned long	lock_timeout;
	bool	read_only;
	bool	lock_on_read;
	bool	exclusive;
	bool	trim;
};

#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_MAX_RQ
#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
#define RBD_READ_ONLY_DEFAULT	false
#define RBD_LOCK_ON_READ_DEFAULT false
#define RBD_EXCLUSIVE_DEFAULT	false
#define RBD_TRIM_DEFAULT	true

struct parse_rbd_opts_ctx {
	struct rbd_spec		*spec;
	struct rbd_options	*opts;
};

static int parse_rbd_opts_token(char *c, void *private)
{
	struct parse_rbd_opts_ctx *pctx = private;
	substring_t argstr[MAX_OPT_ARGS];
	int token, intval, ret;

	token = match_token(c, rbd_opts_tokens, argstr);
	if (token < Opt_last_int) {
		ret = match_int(&argstr[0], &intval);
		if (ret < 0) {
			pr_err("bad option arg (not int) at '%s'\n", c);
			return ret;
		}
		dout("got int token %d val %d\n", token, intval);
	} else if (token > Opt_last_int && token < Opt_last_string) {
		dout("got string token %d val %s\n", token, argstr[0].from);
	} else {
		dout("got token %d\n", token);
	}

	switch (token) {
	case Opt_queue_depth:
		if (intval < 1) {
			pr_err("queue_depth out of range\n");
			return -EINVAL;
		}
		pctx->opts->queue_depth = intval;
		break;
	case Opt_alloc_size:
		if (intval < SECTOR_SIZE) {
			pr_err("alloc_size out of range\n");
			return -EINVAL;
		}
		if (!is_power_of_2(intval)) {
			pr_err("alloc_size must be a power of 2\n");
			return -EINVAL;
		}
		pctx->opts->alloc_size = intval;
		break;
	case Opt_lock_timeout:
		/* 0 is "wait forever" (i.e. infinite timeout) */
		if (intval < 0 || intval > INT_MAX / 1000) {
			pr_err("lock_timeout out of range\n");
			return -EINVAL;
		}
		pctx->opts->lock_timeout = msecs_to_jiffies(intval * 1000);
		break;
	case Opt_pool_ns:
		kfree(pctx->spec->pool_ns);
		pctx->spec->pool_ns = match_strdup(argstr);
		if (!pctx->spec->pool_ns)
			return -ENOMEM;
		break;
	case Opt_read_only:
		pctx->opts->read_only = true;
		break;
	case Opt_read_write:
		pctx->opts->read_only = false;
		break;
	case Opt_lock_on_read:
		pctx->opts->lock_on_read = true;
		break;
	case Opt_exclusive:
		pctx->opts->exclusive = true;
		break;
	case Opt_notrim:
		pctx->opts->trim = false;
		break;
	default:
		/* libceph prints "bad option" msg */
		return -EINVAL;
	}

	return 0;
}
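
/*
 * For example, mapping with "queue_depth=128,alloc_size=65536,
 * lock_on_read" hands each token that libceph itself does not
 * recognize to the callback above, one token at a time.
 */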

static char* obj_op_name(enum obj_operation_type op_type)
{
	switch (op_type) {
	case OBJ_OP_READ:
		return "read";
	case OBJ_OP_WRITE:
		return "write";
	case OBJ_OP_DISCARD:
		return "discard";
	case OBJ_OP_ZEROOUT:
		return "zeroout";
	default:
		return "???";
	}
}

/*
 * Destroy ceph client.  Takes rbd_client_list_lock to unlink the
 * client, so callers must not hold it.
 */
static void rbd_client_release(struct kref *kref)
{
	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);

	dout("%s: rbdc %p\n", __func__, rbdc);
	spin_lock(&rbd_client_list_lock);
	list_del(&rbdc->node);
	spin_unlock(&rbd_client_list_lock);

	ceph_destroy_client(rbdc->client);
	kfree(rbdc);
}

/*
 * Drop reference to ceph client node. If it's not referenced anymore, release
 * it.
 */
static void rbd_put_client(struct rbd_client *rbdc)
{
	if (rbdc)
		kref_put(&rbdc->kref, rbd_client_release);
}

/*
 * Get a ceph client with specific addr and configuration, if one does
 * not exist create it.  Either way, ceph_opts is consumed by this
 * function.
 */
static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
{
	struct rbd_client *rbdc;
	int ret;

	mutex_lock(&client_mutex);
	rbdc = rbd_client_find(ceph_opts);
	if (rbdc) {
		ceph_destroy_options(ceph_opts);

		/*
		 * Using an existing client.  Make sure ->pg_pools is up to
		 * date before we look up the pool id in do_rbd_add().
		 */
		ret = ceph_wait_for_latest_osdmap(rbdc->client,
					rbdc->client->options->mount_timeout);
		if (ret) {
			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
			rbd_put_client(rbdc);
			rbdc = ERR_PTR(ret);
		}
	} else {
		rbdc = rbd_client_create(ceph_opts);
	}
	mutex_unlock(&client_mutex);

	return rbdc;
}

static bool rbd_image_format_valid(u32 image_format)
{
	return image_format == 1 || image_format == 2;
}

static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
{
	size_t size;
	u32 snap_count;

	/* The header has to start with the magic rbd header text */
	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
		return false;

	/* The bio layer requires at least sector-sized I/O */

	if (ondisk->options.order < SECTOR_SHIFT)
		return false;

	/* If we use u64 in a few spots we may be able to loosen this */

	if (ondisk->options.order > 8 * sizeof (int) - 1)
		return false;

	/*
	 * The size of a snapshot header has to fit in a size_t, and
	 * that limits the number of snapshots.
	 */
	snap_count = le32_to_cpu(ondisk->snap_count);
	size = SIZE_MAX - sizeof (struct ceph_snap_context);
	if (snap_count > size / sizeof (__le64))
		return false;

	/*
	 * Not only that, but the size of the entire snapshot
	 * header must also be representable in a size_t.
	 */
	size -= snap_count * sizeof (__le64);
	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
		return false;

	return true;
}
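
/*
 * Together the two size_t checks above bound snap_count * sizeof(__le64)
 * plus snap_names_len, i.e. the whole variable-length tail of the
 * on-disk header, so later allocation-size arithmetic cannot overflow.
 */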

/*
 * returns the size of an object in the image
 */
static u32 rbd_obj_bytes(struct rbd_image_header *header)
{
	return 1U << header->obj_order;
}

static void rbd_init_layout(struct rbd_device *rbd_dev)
{
	if (rbd_dev->header.stripe_unit == 0 ||
	    rbd_dev->header.stripe_count == 0) {
		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
		rbd_dev->header.stripe_count = 1;
	}

	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
}
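
/*
 * Without the STRIPINGV2 feature the fallback above yields the trivial
 * layout: stripe_unit == object size and stripe_count == 1, so e.g. a
 * default 4M-object image stores each 4M chunk in a single object.
 */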

/*
 * Fill an rbd image header with information from the given format 1
 * on-disk header.
 */
static int rbd_header_from_disk(struct rbd_device *rbd_dev,
				 struct rbd_image_header_ondisk *ondisk)
{
	struct rbd_image_header *header = &rbd_dev->header;
	bool first_time = header->object_prefix == NULL;
	struct ceph_snap_context *snapc;
	char *object_prefix = NULL;
	char *snap_names = NULL;
	u64 *snap_sizes = NULL;
	u32 snap_count;
	int ret = -ENOMEM;
	u32 i;

	/* Allocate this now to avoid having to handle failure below */

	if (first_time) {
		object_prefix = kstrndup(ondisk->object_prefix,
					 sizeof(ondisk->object_prefix),
					 GFP_KERNEL);
		if (!object_prefix)
			return -ENOMEM;
	}

	/* Allocate the snapshot context and fill it in */

	snap_count = le32_to_cpu(ondisk->snap_count);
	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
	if (!snapc)
		goto out_err;
	snapc->seq = le64_to_cpu(ondisk->snap_seq);
	if (snap_count) {
		struct rbd_image_snap_ondisk *snaps;
		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);

		/* We'll keep a copy of the snapshot names... */

		if (snap_names_len > (u64)SIZE_MAX)
			goto out_2big;
		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
		if (!snap_names)
			goto out_err;

		/* ...as well as the array of their sizes. */
		snap_sizes = kmalloc_array(snap_count,
					   sizeof(*header->snap_sizes),
					   GFP_KERNEL);
		if (!snap_sizes)
			goto out_err;

		/*
		 * Copy the names, and fill in each snapshot's id
		 * and size.
		 *
		 * Note that rbd_dev_v1_header_info() guarantees the
		 * ondisk buffer we're working with has
		 * snap_names_len bytes beyond the end of the
		 * snapshot id array, so this memcpy() is safe.
		 */
		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
		snaps = ondisk->snaps;
		for (i = 0; i < snap_count; i++) {
			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
		}
	}

	/* We won't fail any more, fill in the header */

	if (first_time) {
		header->object_prefix = object_prefix;
		header->obj_order = ondisk->options.order;
		rbd_init_layout(rbd_dev);
	} else {
		ceph_put_snap_context(header->snapc);
		kfree(header->snap_names);
		kfree(header->snap_sizes);
	}

	/* The remaining fields always get updated (when we refresh) */

	header->image_size = le64_to_cpu(ondisk->image_size);
	header->snapc = snapc;
	header->snap_names = snap_names;
	header->snap_sizes = snap_sizes;

	return 0;
out_2big:
	ret = -EIO;
out_err:
	kfree(snap_sizes);
	kfree(snap_names);
	ceph_put_snap_context(snapc);
	kfree(object_prefix);

	return ret;
}
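
/*
 * Only the first call (first_time) sets the immutable fields
 * (object_prefix, obj_order, layout); a refresh replaces just the
 * snapshot context, names, sizes and image size, freeing the old
 * copies above.
 */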
1168
Alex Elder9682fc62013-04-30 00:44:33 -05001169static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
1170{
1171 const char *snap_name;
1172
1173 rbd_assert(which < rbd_dev->header.snapc->num_snaps);
1174
1175 /* Skip over names until we find the one we are looking for */
1176
1177 snap_name = rbd_dev->header.snap_names;
1178 while (which--)
1179 snap_name += strlen(snap_name) + 1;
1180
1181 return kstrdup(snap_name, GFP_KERNEL);
1182}
1183
Alex Elder30d1cff2013-05-01 12:43:03 -05001184/*
1185 * Snapshot id comparison function for use with qsort()/bsearch().
1186 * Note that result is for snapshots in *descending* order.
1187 */
1188static int snapid_compare_reverse(const void *s1, const void *s2)
1189{
1190 u64 snap_id1 = *(u64 *)s1;
1191 u64 snap_id2 = *(u64 *)s2;
1192
1193 if (snap_id1 < snap_id2)
1194 return 1;
1195 return snap_id1 == snap_id2 ? 0 : -1;
1196}
1197
1198/*
1199 * Search a snapshot context to see if the given snapshot id is
1200 * present.
1201 *
1202 * Returns the position of the snapshot id in the array if it's found,
1203 * or BAD_SNAP_INDEX otherwise.
1204 *
1205 * Note: The snapshot array is in kept sorted (by the osd) in
1206 * reverse order, highest snapshot id first.
1207 */
Alex Elder9682fc62013-04-30 00:44:33 -05001208static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
1209{
1210 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
Alex Elder30d1cff2013-05-01 12:43:03 -05001211 u64 *found;
Alex Elder9682fc62013-04-30 00:44:33 -05001212
Alex Elder30d1cff2013-05-01 12:43:03 -05001213 found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
1214 sizeof (snap_id), snapid_compare_reverse);
Alex Elder9682fc62013-04-30 00:44:33 -05001215
Alex Elder30d1cff2013-05-01 12:43:03 -05001216 return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
Alex Elder9682fc62013-04-30 00:44:33 -05001217}
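/*
 * Illustration (not part of the driver): the descending-order bsearch()
 * above can be exercised with a hand-built array.  With the snapshot
 * ids kept highest-first, as the OSD keeps them:
 *
 *	u64 snaps[] = { 12, 9, 5, 2 };
 *	u64 key = 5;
 *	u64 *found = bsearch(&key, snaps, ARRAY_SIZE(snaps),
 *			     sizeof(key), snapid_compare_reverse);
 *	// found - snaps == 2, i.e. snap id 5 sits at index 2
 *
 * A key of 7 yields NULL, which rbd_dev_snap_index() reports as
 * BAD_SNAP_INDEX.
 */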
1218
Alex Elder2ad3d712013-04-30 00:44:33 -05001219static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
1220 u64 snap_id)
Alex Elder54cac612013-04-30 00:44:33 -05001221{
1222 u32 which;
Josh Durginda6a6b62013-09-04 17:57:31 -07001223 const char *snap_name;
Alex Elder54cac612013-04-30 00:44:33 -05001224
1225 which = rbd_dev_snap_index(rbd_dev, snap_id);
1226 if (which == BAD_SNAP_INDEX)
Josh Durginda6a6b62013-09-04 17:57:31 -07001227 return ERR_PTR(-ENOENT);
Alex Elder54cac612013-04-30 00:44:33 -05001228
Josh Durginda6a6b62013-09-04 17:57:31 -07001229 snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
1230 return snap_name ? snap_name : ERR_PTR(-ENOMEM);
Alex Elder54cac612013-04-30 00:44:33 -05001231}
1232
Alex Elder9e15b772012-10-30 19:40:33 -05001233static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
1234{
Alex Elder9e15b772012-10-30 19:40:33 -05001235 if (snap_id == CEPH_NOSNAP)
1236 return RBD_SNAP_HEAD_NAME;
1237
Alex Elder54cac612013-04-30 00:44:33 -05001238 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1239 if (rbd_dev->image_format == 1)
1240 return rbd_dev_v1_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001241
Alex Elder54cac612013-04-30 00:44:33 -05001242 return rbd_dev_v2_snap_name(rbd_dev, snap_id);
Alex Elder9e15b772012-10-30 19:40:33 -05001243}
1244
Alex Elder2ad3d712013-04-30 00:44:33 -05001245static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
1246 u64 *snap_size)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001247{
Alex Elder2ad3d712013-04-30 00:44:33 -05001248 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1249 if (snap_id == CEPH_NOSNAP) {
1250 *snap_size = rbd_dev->header.image_size;
1251 } else if (rbd_dev->image_format == 1) {
1252 u32 which;
Alex Elder00f1f362012-02-07 12:03:36 -06001253
Alex Elder2ad3d712013-04-30 00:44:33 -05001254 which = rbd_dev_snap_index(rbd_dev, snap_id);
1255 if (which == BAD_SNAP_INDEX)
1256 return -ENOENT;
Alex Elder00f1f362012-02-07 12:03:36 -06001257
Alex Elder2ad3d712013-04-30 00:44:33 -05001258 *snap_size = rbd_dev->header.snap_sizes[which];
1259 } else {
1260 u64 size = 0;
1261 int ret;
1262
1263 ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
1264 if (ret)
1265 return ret;
1266
1267 *snap_size = size;
1268 }
1269 return 0;
1270}
1271
1272static int rbd_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
1273 u64 *snap_features)
1274{
1275 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
1276 if (snap_id == CEPH_NOSNAP) {
1277 *snap_features = rbd_dev->header.features;
1278 } else if (rbd_dev->image_format == 1) {
1279 *snap_features = 0; /* No features for format 1 */
1280 } else {
1281 u64 features = 0;
1282 int ret;
1283
1284 ret = _rbd_dev_v2_snap_features(rbd_dev, snap_id, &features);
1285 if (ret)
1286 return ret;
1287
1288 *snap_features = features;
1289 }
1290 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001291}
1292
Alex Elderd1cf5782013-04-27 09:59:30 -05001293static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001294{
Alex Elder8f4b7d92013-05-06 07:40:30 -05001295 u64 snap_id = rbd_dev->spec->snap_id;
Alex Elder2ad3d712013-04-30 00:44:33 -05001296 u64 size = 0;
1297 u64 features = 0;
1298 int ret;
Alex Elder8b0241f2013-04-25 23:15:08 -05001299
Alex Elder2ad3d712013-04-30 00:44:33 -05001300 ret = rbd_snap_size(rbd_dev, snap_id, &size);
1301 if (ret)
1302 return ret;
1303 ret = rbd_snap_features(rbd_dev, snap_id, &features);
1304 if (ret)
1305 return ret;
1306
1307 rbd_dev->mapping.size = size;
1308 rbd_dev->mapping.features = features;
1309
Alex Elder8b0241f2013-04-25 23:15:08 -05001310 return 0;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001311}
1312
Alex Elderd1cf5782013-04-27 09:59:30 -05001313static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
1314{
1315 rbd_dev->mapping.size = 0;
1316 rbd_dev->mapping.features = 0;
Alex Elder200a6a82013-04-28 23:32:34 -05001317}
1318
Ilya Dryomov5359a172018-01-20 10:30:10 +01001319static void zero_bvec(struct bio_vec *bv)
Alex Elder65ccfe22012-08-09 10:33:26 -07001320{
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001321 void *buf;
Ilya Dryomov5359a172018-01-20 10:30:10 +01001322 unsigned long flags;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001323
Ilya Dryomov5359a172018-01-20 10:30:10 +01001324 buf = bvec_kmap_irq(bv, &flags);
1325 memset(buf, 0, bv->bv_len);
1326 flush_dcache_page(bv->bv_page);
1327 bvec_kunmap_irq(buf, &flags);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001328}
1329
Ilya Dryomov5359a172018-01-20 10:30:10 +01001330static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
Alex Elderb9434c52013-04-19 15:34:50 -05001331{
Ilya Dryomov5359a172018-01-20 10:30:10 +01001332 struct ceph_bio_iter it = *bio_pos;
Alex Elderb9434c52013-04-19 15:34:50 -05001333
Ilya Dryomov5359a172018-01-20 10:30:10 +01001334 ceph_bio_iter_advance(&it, off);
1335 ceph_bio_iter_advance_step(&it, bytes, ({
1336 zero_bvec(&bv);
1337 }));
Alex Elderb9434c52013-04-19 15:34:50 -05001338}
1339
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001340static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001341{
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001342 struct ceph_bvec_iter it = *bvec_pos;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001343
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001344 ceph_bvec_iter_advance(&it, off);
1345 ceph_bvec_iter_advance_step(&it, bytes, ({
1346 zero_bvec(&bv);
1347 }));
Alex Elderf7760da2012-10-20 22:17:27 -05001348}
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001349
Alex Elderf7760da2012-10-20 22:17:27 -05001350/*
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001351 * Zero a range in @obj_req's data buffer defined by a bio (list) or
Ilya Dryomovafb97882018-02-06 19:26:35 +01001352 * (private) bio_vec array.
Alex Elderf7760da2012-10-20 22:17:27 -05001353 *
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001354 * @off is relative to the start of the data buffer.
Alex Elderf7760da2012-10-20 22:17:27 -05001355 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001356static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
1357 u32 bytes)
Alex Elderf7760da2012-10-20 22:17:27 -05001358{
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001359 dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
1360
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001361 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001362 case OBJ_REQUEST_BIO:
1363 zero_bios(&obj_req->bio_pos, off, bytes);
1364 break;
1365 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001366 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001367 zero_bvecs(&obj_req->bvec_pos, off, bytes);
1368 break;
1369 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001370 BUG();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07001371 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001372}
1373
1374static void rbd_obj_request_destroy(struct kref *kref);
1375static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
1376{
1377 rbd_assert(obj_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001378 dout("%s: obj %p (was %d)\n", __func__, obj_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001379 kref_read(&obj_request->kref));
Alex Elderbf0d5f502012-11-22 00:00:08 -06001380 kref_put(&obj_request->kref, rbd_obj_request_destroy);
1381}
1382
Alex Elderbf0d5f502012-11-22 00:00:08 -06001383static void rbd_img_request_destroy(struct kref *kref);
1384static void rbd_img_request_put(struct rbd_img_request *img_request)
1385{
1386 rbd_assert(img_request != NULL);
Alex Elder37206ee2013-02-20 17:32:08 -06001387 dout("%s: img %p (was %d)\n", __func__, img_request,
Peter Zijlstra2c935bc2016-11-14 17:29:48 +01001388 kref_read(&img_request->kref));
Ilya Dryomove93aca02018-02-06 19:26:35 +01001389 kref_put(&img_request->kref, rbd_img_request_destroy);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001390}
1391
1392static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
1393 struct rbd_obj_request *obj_request)
1394{
Alex Elder25dcf952013-01-25 17:08:55 -06001395 rbd_assert(obj_request->img_request == NULL);
1396
Alex Elderb155e862013-04-15 14:50:37 -05001397 /* Image request now owns object's original reference */
Alex Elderbf0d5f502012-11-22 00:00:08 -06001398 obj_request->img_request = img_request;
Ilya Dryomov15961b42018-02-01 11:50:47 +01001399 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001400}
1401
1402static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
1403 struct rbd_obj_request *obj_request)
1404{
Ilya Dryomov15961b42018-02-01 11:50:47 +01001405 dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001406 list_del(&obj_request->ex.oe_item);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001407 rbd_assert(obj_request->img_request == img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001408 rbd_obj_request_put(obj_request);
1409}
1410
Ilya Dryomov980917f2016-09-12 18:59:42 +02001411static void rbd_obj_request_submit(struct rbd_obj_request *obj_request)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001412{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001413 struct ceph_osd_request *osd_req =
1414 list_last_entry(&obj_request->osd_reqs, struct ceph_osd_request,
1415 r_private_item);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001416
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001417 dout("%s %p object_no %016llx %llu~%llu osd_req %p\n", __func__,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001418 obj_request, obj_request->ex.oe_objno, obj_request->ex.oe_off,
1419 obj_request->ex.oe_len, osd_req);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001420 ceph_osdc_start_request(osd_req->r_osdc, osd_req, false);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001421}
1422
Alex Elder0c425242013-02-08 09:55:49 -06001423/*
1424 * The default/initial value for all image request flags is 0. Each
1425 * is conditionally set to 1 at image request initialization time
1426 * and currently never changes thereafter.
1427 */
Alex Elderd0b2e942013-01-24 16:13:36 -06001428static void img_request_layered_set(struct rbd_img_request *img_request)
1429{
1430 set_bit(IMG_REQ_LAYERED, &img_request->flags);
1431 smp_mb();
1432}
1433
Alex Eldera2acd002013-05-08 22:50:04 -05001434static void img_request_layered_clear(struct rbd_img_request *img_request)
1435{
1436 clear_bit(IMG_REQ_LAYERED, &img_request->flags);
1437 smp_mb();
1438}
1439
Alex Elderd0b2e942013-01-24 16:13:36 -06001440static bool img_request_layered_test(struct rbd_img_request *img_request)
1441{
1442 smp_mb();
1443 return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
1444}
1445
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001446static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001447{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001448 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1449
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001450 return !obj_req->ex.oe_off &&
1451 obj_req->ex.oe_len == rbd_dev->layout.object_size;
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001452}
1453
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001454static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
Alex Elder6e2a4502013-03-27 09:16:30 -05001455{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001456 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderb9434c52013-04-19 15:34:50 -05001457
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001458 return obj_req->ex.oe_off + obj_req->ex.oe_len ==
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001459 rbd_dev->layout.object_size;
1460}
1461
Ilya Dryomov13488d52019-02-25 12:37:50 +01001462/*
1463 * Must be called after rbd_obj_calc_img_extents().
1464 */
1465static bool rbd_obj_copyup_enabled(struct rbd_obj_request *obj_req)
1466{
1467 if (!obj_req->num_img_extents ||
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001468 (rbd_obj_is_entire(obj_req) &&
1469 !obj_req->img_request->snapc->num_snaps))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001470 return false;
1471
1472 return true;
1473}
1474
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001475static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
1476{
1477 return ceph_file_extents_bytes(obj_req->img_extents,
1478 obj_req->num_img_extents);
1479}
1480
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001481static bool rbd_img_is_write(struct rbd_img_request *img_req)
1482{
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001483 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001484 case OBJ_OP_READ:
1485 return false;
1486 case OBJ_OP_WRITE:
1487 case OBJ_OP_DISCARD:
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001488 case OBJ_OP_ZEROOUT:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001489 return true;
1490 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02001491 BUG();
Alex Elder6e2a4502013-03-27 09:16:30 -05001492 }
Alex Elder6e2a4502013-03-27 09:16:30 -05001493}
1494
Ilya Dryomov85e084f2016-04-28 16:07:24 +02001495static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001496{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001497 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001498 int result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001499
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001500 dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
1501 osd_req->r_result, obj_req);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001502
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001503 /*
1504 * Writes aren't allowed to return a data payload. In some
1505 * guarded write cases (e.g. stat + zero on an empty object)
1506 * a stat response makes it through, but we don't care.
1507 */
1508 if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
1509 result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001510 else
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001511 result = osd_req->r_result;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001512
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02001513 rbd_obj_handle_request(obj_req, result);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001514}
1515
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001516static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
Alex Elder430c28c2013-04-03 21:32:51 -05001517{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001518 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder430c28c2013-04-03 21:32:51 -05001519
Ilya Dryomova162b302018-01-30 17:52:10 +01001520 osd_req->r_flags = CEPH_OSD_FLAG_READ;
Ilya Dryomov7c848832016-09-15 17:56:39 +02001521 osd_req->r_snapid = obj_request->img_request->snap_id;
Alex Elder9d4df012013-04-19 15:34:50 -05001522}
1523
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001524static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
Alex Elder9d4df012013-04-19 15:34:50 -05001525{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001526 struct rbd_obj_request *obj_request = osd_req->r_priv;
Alex Elder9d4df012013-04-19 15:34:50 -05001527
Ilya Dryomova162b302018-01-30 17:52:10 +01001528 osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001529 ktime_get_real_ts64(&osd_req->r_mtime);
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001530 osd_req->r_data_offset = obj_request->ex.oe_off;
Alex Elder430c28c2013-04-03 21:32:51 -05001531}
1532
Ilya Dryomovbc812072017-01-25 18:16:23 +01001533static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001534__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
1535 struct ceph_snap_context *snapc, int num_ops)
Ilya Dryomovbc812072017-01-25 18:16:23 +01001536{
Ilya Dryomove28eded2019-02-25 11:42:26 +01001537 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001538 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
1539 struct ceph_osd_request *req;
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001540 const char *name_format = rbd_dev->image_format == 1 ?
1541 RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001542 int ret;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001543
Ilya Dryomove28eded2019-02-25 11:42:26 +01001544 req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001545 if (!req)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001546 return ERR_PTR(-ENOMEM);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001547
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001548 list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001549 req->r_callback = rbd_osd_req_callback;
Ilya Dryomova162b302018-01-30 17:52:10 +01001550 req->r_priv = obj_req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001551
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001552 /*
1553	 * Data objects may be stored in a separate pool, but they always
1554	 * use the same namespace in that pool as the header does in its pool.
1555 */
1556 ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001557 req->r_base_oloc.pool = rbd_dev->layout.pool_id;
Ilya Dryomovb26c0472018-07-03 15:28:43 +02001558
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001559 ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
1560 rbd_dev->header.object_prefix,
1561 obj_req->ex.oe_objno);
1562 if (ret)
1563 return ERR_PTR(ret);
Ilya Dryomovbc812072017-01-25 18:16:23 +01001564
Ilya Dryomovbc812072017-01-25 18:16:23 +01001565 return req;
Ilya Dryomovbc812072017-01-25 18:16:23 +01001566}
1567
Ilya Dryomove28eded2019-02-25 11:42:26 +01001568static struct ceph_osd_request *
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001569rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
Ilya Dryomove28eded2019-02-25 11:42:26 +01001570{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001571 return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
1572 num_ops);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001573}
1574
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001575static struct rbd_obj_request *rbd_obj_request_create(void)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001576{
1577 struct rbd_obj_request *obj_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001578
Ilya Dryomov5a60e872015-06-24 17:24:33 +03001579 obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
Ilya Dryomov6c696d82017-01-25 18:16:23 +01001580 if (!obj_request)
Alex Elderf907ad52013-05-01 12:43:03 -05001581 return NULL;
Alex Elderf907ad52013-05-01 12:43:03 -05001582
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001583 ceph_object_extent_init(&obj_request->ex);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001584 INIT_LIST_HEAD(&obj_request->osd_reqs);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001585 mutex_init(&obj_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001586 kref_init(&obj_request->kref);
1587
Ilya Dryomov67e2b652017-01-25 18:16:22 +01001588 dout("%s %p\n", __func__, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001589 return obj_request;
1590}
1591
1592static void rbd_obj_request_destroy(struct kref *kref)
1593{
1594 struct rbd_obj_request *obj_request;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001595 struct ceph_osd_request *osd_req;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001596 u32 i;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001597
1598 obj_request = container_of(kref, struct rbd_obj_request, kref);
1599
Alex Elder37206ee2013-02-20 17:32:08 -06001600 dout("%s: obj %p\n", __func__, obj_request);
1601
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001602 while (!list_empty(&obj_request->osd_reqs)) {
1603 osd_req = list_first_entry(&obj_request->osd_reqs,
1604 struct ceph_osd_request, r_private_item);
1605 list_del_init(&osd_req->r_private_item);
1606 ceph_osdc_put_request(osd_req);
1607 }
Alex Elderbf0d5f502012-11-22 00:00:08 -06001608
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001609 switch (obj_request->img_request->data_type) {
Alex Elder9969ebc2013-01-18 12:31:10 -06001610 case OBJ_REQUEST_NODATA:
Alex Elderbf0d5f502012-11-22 00:00:08 -06001611 case OBJ_REQUEST_BIO:
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001612 case OBJ_REQUEST_BVECS:
Ilya Dryomov5359a172018-01-20 10:30:10 +01001613 break; /* Nothing to do */
Ilya Dryomovafb97882018-02-06 19:26:35 +01001614 case OBJ_REQUEST_OWN_BVECS:
1615 kfree(obj_request->bvec_pos.bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001616 break;
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001617 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001618 BUG();
Alex Elderbf0d5f502012-11-22 00:00:08 -06001619 }
1620
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001621 kfree(obj_request->img_extents);
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01001622 if (obj_request->copyup_bvecs) {
1623 for (i = 0; i < obj_request->copyup_bvec_count; i++) {
1624 if (obj_request->copyup_bvecs[i].bv_page)
1625 __free_page(obj_request->copyup_bvecs[i].bv_page);
1626 }
1627 kfree(obj_request->copyup_bvecs);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001628 }
1629
Alex Elder868311b2013-05-01 12:43:03 -05001630 kmem_cache_free(rbd_obj_request_cache, obj_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001631}
1632
Alex Elderfb65d2282013-05-08 22:50:04 -05001633/* It's OK to call this for a device with no parent */
1634
1635static void rbd_spec_put(struct rbd_spec *spec);
1636static void rbd_dev_unparent(struct rbd_device *rbd_dev)
1637{
1638 rbd_dev_remove_parent(rbd_dev);
1639 rbd_spec_put(rbd_dev->parent_spec);
1640 rbd_dev->parent_spec = NULL;
1641 rbd_dev->parent_overlap = 0;
1642}
1643
Alex Elderbf0d5f502012-11-22 00:00:08 -06001644/*
Alex Eldera2acd002013-05-08 22:50:04 -05001645 * Parent image reference counting is used to determine when an
1646 * image's parent fields can be safely torn down--after there are no
1647 * more in-flight requests to the parent image. When the last
1648 * reference is dropped, cleaning them up is safe.
1649 */
1650static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
1651{
1652 int counter;
1653
1654 if (!rbd_dev->parent_spec)
1655 return;
1656
1657 counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
1658 if (counter > 0)
1659 return;
1660
1661 /* Last reference; clean up parent data structures */
1662
1663 if (!counter)
1664 rbd_dev_unparent(rbd_dev);
1665 else
Ilya Dryomov9584d502014-07-11 12:11:20 +04001666 rbd_warn(rbd_dev, "parent reference underflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001667}
1668
1669/*
1670 * If an image has a non-zero parent overlap, get a reference to its
1671 * parent.
1672 *
1673 * Returns true if the rbd device has a parent with a non-zero
1674 * overlap and a reference for it was successfully taken, or
1675 * false otherwise.
1676 */
1677static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
1678{
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001679 int counter = 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001680
1681 if (!rbd_dev->parent_spec)
1682 return false;
1683
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001684 down_read(&rbd_dev->header_rwsem);
1685 if (rbd_dev->parent_overlap)
1686 counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
1687 up_read(&rbd_dev->header_rwsem);
Alex Eldera2acd002013-05-08 22:50:04 -05001688
1689 if (counter < 0)
Ilya Dryomov9584d502014-07-11 12:11:20 +04001690 rbd_warn(rbd_dev, "parent reference overflow");
Alex Eldera2acd002013-05-08 22:50:04 -05001691
Ilya Dryomovae43e9d2015-01-19 18:13:43 +03001692 return counter > 0;
Alex Eldera2acd002013-05-08 22:50:04 -05001693}
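/*
 * Usage sketch: the get/put pair brackets the lifetime of anything that
 * touches the parent image.  rbd_img_request_create() below takes the
 * reference and marks the request layered; rbd_img_request_destroy()
 * drops it:
 *
 *	if (rbd_dev_parent_get(rbd_dev))
 *		img_request_layered_set(img_request);
 *	...
 *	if (img_request_layered_test(img_request)) {
 *		img_request_layered_clear(img_request);
 *		rbd_dev_parent_put(img_request->rbd_dev);
 *	}
 */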
1694
Alex Elderbf0d5f502012-11-22 00:00:08 -06001695/*
1696 * Caller is responsible for filling in the list of object requests
1697 * that comprises the image request, and the Linux request pointer
1698 * (if there is one).
1699 */
Alex Eldercc344fa2013-02-19 12:25:56 -06001700static struct rbd_img_request *rbd_img_request_create(
1701 struct rbd_device *rbd_dev,
Guangliang Zhao6d2940c2014-03-13 11:21:35 +08001702 enum obj_operation_type op_type,
Josh Durgin4e752f02014-04-08 11:12:11 -07001703 struct ceph_snap_context *snapc)
Alex Elderbf0d5f502012-11-22 00:00:08 -06001704{
1705 struct rbd_img_request *img_request;
Alex Elderbf0d5f502012-11-22 00:00:08 -06001706
Ilya Dryomova0c58952018-01-22 16:03:06 +01001707 img_request = kmem_cache_zalloc(rbd_img_request_cache, GFP_NOIO);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001708 if (!img_request)
1709 return NULL;
1710
Alex Elderbf0d5f502012-11-22 00:00:08 -06001711 img_request->rbd_dev = rbd_dev;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001712 img_request->op_type = op_type;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001713 if (!rbd_img_is_write(img_request))
Alex Elderbf0d5f502012-11-22 00:00:08 -06001714 img_request->snap_id = rbd_dev->spec->snap_id;
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001715 else
1716 img_request->snapc = snapc;
1717
Alex Eldera2acd002013-05-08 22:50:04 -05001718 if (rbd_dev_parent_get(rbd_dev))
Alex Elderd0b2e942013-01-24 16:13:36 -06001719 img_request_layered_set(img_request);
Ilya Dryomova0c58952018-01-22 16:03:06 +01001720
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001721 INIT_LIST_HEAD(&img_request->object_extents);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02001722 mutex_init(&img_request->state_mutex);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001723 kref_init(&img_request->kref);
1724
Ilya Dryomovdfd98752018-02-06 19:26:35 +01001725 dout("%s: rbd_dev %p %s -> img %p\n", __func__, rbd_dev,
1726 obj_op_name(op_type), img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001727 return img_request;
1728}
1729
1730static void rbd_img_request_destroy(struct kref *kref)
1731{
1732 struct rbd_img_request *img_request;
1733 struct rbd_obj_request *obj_request;
1734 struct rbd_obj_request *next_obj_request;
1735
1736 img_request = container_of(kref, struct rbd_img_request, kref);
1737
Alex Elder37206ee2013-02-20 17:32:08 -06001738 dout("%s: img %p\n", __func__, img_request);
1739
Alex Elderbf0d5f502012-11-22 00:00:08 -06001740 for_each_obj_request_safe(img_request, obj_request, next_obj_request)
1741 rbd_img_obj_request_del(img_request, obj_request);
1742
Alex Eldera2acd002013-05-08 22:50:04 -05001743 if (img_request_layered_test(img_request)) {
1744 img_request_layered_clear(img_request);
1745 rbd_dev_parent_put(img_request->rbd_dev);
1746 }
1747
Ilya Dryomov9bb02482018-01-30 17:52:10 +01001748 if (rbd_img_is_write(img_request))
Alex Elder812164f82013-04-30 00:44:32 -05001749 ceph_put_snap_context(img_request->snapc);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001750
Alex Elder1c2a9df2013-05-01 12:43:03 -05001751 kmem_cache_free(rbd_img_request_cache, img_request);
Alex Elderbf0d5f502012-11-22 00:00:08 -06001752}
1753
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001754static void prune_extents(struct ceph_file_extent *img_extents,
1755 u32 *num_img_extents, u64 overlap)
Alex Eldere93f3152013-05-08 22:50:04 -05001756{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001757 u32 cnt = *num_img_extents;
Alex Eldere93f3152013-05-08 22:50:04 -05001758
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001759 /* drop extents completely beyond the overlap */
1760 while (cnt && img_extents[cnt - 1].fe_off >= overlap)
1761 cnt--;
Alex Eldere93f3152013-05-08 22:50:04 -05001762
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001763 if (cnt) {
1764 struct ceph_file_extent *ex = &img_extents[cnt - 1];
Alex Eldere93f3152013-05-08 22:50:04 -05001765
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001766 /* trim final overlapping extent */
1767 if (ex->fe_off + ex->fe_len > overlap)
1768 ex->fe_len = overlap - ex->fe_off;
Alex Elder12178572013-02-08 09:55:49 -06001769 }
1770
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001771 *num_img_extents = cnt;
Alex Elder21692382013-04-05 01:27:12 -05001772}
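/*
 * Worked example for prune_extents(): with img_extents = { 0M~4M,
 * 6M~2M } and overlap = 7M, neither extent starts at or past 7M, so
 * both survive; the final one straddles the overlap and is trimmed to
 * 6M~1M.  With overlap = 5M the 6M~2M extent is dropped outright and
 * 0M~4M is left untouched.
 */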
1773
Alex Elderf1a47392013-04-19 15:34:50 -05001774/*
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001775 * Determine the byte range(s) covered by either just the object extent
1776 * or the entire object in the parent image.
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001777 */
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001778static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
1779 bool entire)
Josh Durgin3b434a2a2014-04-04 17:32:15 -07001780{
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001781 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001782 int ret;
1783
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001784 if (!rbd_dev->parent_overlap)
1785 return 0;
1786
1787 ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
1788 entire ? 0 : obj_req->ex.oe_off,
1789 entire ? rbd_dev->layout.object_size :
1790 obj_req->ex.oe_len,
1791 &obj_req->img_extents,
1792 &obj_req->num_img_extents);
1793 if (ret)
1794 return ret;
1795
1796 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
1797 rbd_dev->parent_overlap);
1798 return 0;
1799}
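/*
 * Example, assuming 4M objects and the default (non-fancy) striping:
 * for object 3 with entire == true, the reverse map asks for the file
 * range 12M~4M.  If parent_overlap is 13M, prune_extents() trims that
 * to 12M~1M; if it is 12M or less, no image extents remain and the
 * object is backed by no parent data at all.
 */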
1800
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001801static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001802{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001803 struct rbd_obj_request *obj_req = osd_req->r_priv;
1804
Ilya Dryomovecc633c2018-02-01 11:50:47 +01001805 switch (obj_req->img_request->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001806 case OBJ_REQUEST_BIO:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001807 osd_req_op_extent_osd_data_bio(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001808 &obj_req->bio_pos,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001809 obj_req->ex.oe_len);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001810 break;
1811 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01001812 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001813 rbd_assert(obj_req->bvec_pos.iter.bi_size ==
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001814 obj_req->ex.oe_len);
Ilya Dryomovafb97882018-02-06 19:26:35 +01001815 rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001816 osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001817 &obj_req->bvec_pos);
1818 break;
1819 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01001820 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001821 }
1822}
1823
1824static int rbd_obj_setup_read(struct rbd_obj_request *obj_req)
1825{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001826 struct ceph_osd_request *osd_req;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001827
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001828 osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
1829 if (IS_ERR(osd_req))
1830 return PTR_ERR(osd_req);
1831
1832 osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001833 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001834 rbd_osd_setup_data(osd_req, 0);
Ilya Dryomova90bb0c2017-01-25 18:16:23 +01001835
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001836 rbd_osd_format_read(osd_req);
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001837 obj_req->read_state = RBD_OBJ_READ_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001838 return 0;
1839}
1840
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001841static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001842{
1843 struct page **pages;
Ilya Dryomov710214e2016-09-15 17:53:32 +02001844
Alex Elderc5b5ef62013-02-11 12:33:24 -06001845 /*
1846 * The response data for a STAT call consists of:
1847 * le64 length;
1848 * struct {
1849 * le32 tv_sec;
1850 * le32 tv_nsec;
1851 * } mtime;
1852 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001853 pages = ceph_alloc_page_vector(1, GFP_NOIO);
1854 if (IS_ERR(pages))
1855 return PTR_ERR(pages);
Alex Elderc5b5ef62013-02-11 12:33:24 -06001856
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001857 osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
1858 osd_req_op_raw_data_in_pages(osd_req, which, pages,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001859 8 + sizeof(struct ceph_timespec),
1860 0, false, true);
Ilya Dryomov980917f2016-09-12 18:59:42 +02001861 return 0;
Alex Elderc5b5ef62013-02-11 12:33:24 -06001862}
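/*
 * Sketch of decoding the STAT reply laid out above (a hypothetical
 * helper; the driver never decodes it, since the stat only serves as
 * an existence guard and its payload is ignored):
 *
 *	void *p = page_address(pages[0]);
 *	u64 size = le64_to_cpu(*(__le64 *)p);
 *	struct ceph_timespec *mtime = p + 8;
 *	u32 sec = le32_to_cpu(mtime->tv_sec);
 */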
1863
Ilya Dryomov13488d52019-02-25 12:37:50 +01001864static int count_write_ops(struct rbd_obj_request *obj_req)
1865{
1866 return 2; /* setallochint + write/writefull */
1867}
1868
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001869static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
1870 int which)
Alex Elderb454e362013-04-19 15:34:50 -05001871{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001872 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001873 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
1874 u16 opcode;
Alex Elderb454e362013-04-19 15:34:50 -05001875
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001876 osd_req_op_alloc_hint_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001877 rbd_dev->layout.object_size,
1878 rbd_dev->layout.object_size);
Alex Elderb454e362013-04-19 15:34:50 -05001879
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001880 if (rbd_obj_is_entire(obj_req))
1881 opcode = CEPH_OSD_OP_WRITEFULL;
1882 else
1883 opcode = CEPH_OSD_OP_WRITE;
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001884
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001885 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01001886 obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001887 rbd_osd_setup_data(osd_req, which);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001888
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001889 rbd_osd_format_write(osd_req);
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001890}
1891
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001892static int rbd_obj_setup_write(struct rbd_obj_request *obj_req)
Ilya Dryomov70d045f2014-09-12 16:02:01 +04001893{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001894 struct ceph_osd_request *osd_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001895 unsigned int num_osd_ops, which = 0;
1896 int ret;
Ilya Dryomov058aa992016-09-12 14:44:45 +02001897
Ilya Dryomov86bd7992018-02-06 19:26:33 +01001898 /* reverse map the entire object onto the parent */
1899 ret = rbd_obj_calc_img_extents(obj_req, true);
1900 if (ret)
1901 return ret;
1902
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001903 if (rbd_obj_copyup_enabled(obj_req))
1904 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
1905
1906 num_osd_ops = count_write_ops(obj_req);
1907 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
1908 num_osd_ops++; /* stat */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001909
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001910 osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
1911 if (IS_ERR(osd_req))
1912 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001913
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001914 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001915 ret = rbd_osd_setup_stat(osd_req, which++);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001916 if (ret)
1917 return ret;
1918 }
1919
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001920 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001921 __rbd_osd_setup_write_ops(osd_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001922 return 0;
1923}
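/*
 * Resulting op vector, assuming copyup is enabled for the object: a
 * guarded write is [stat, setallochint, write]; without the guard it
 * is just [setallochint, write].  A write covering the entire object
 * uses writefull in place of write.
 */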
1924
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001925static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
1926{
1927 return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
1928 CEPH_OSD_OP_ZERO;
1929}
1930
1931static int rbd_obj_setup_discard(struct rbd_obj_request *obj_req)
1932{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001933 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001934 struct ceph_osd_request *osd_req;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001935 u64 off = obj_req->ex.oe_off;
1936 u64 next_off = obj_req->ex.oe_off + obj_req->ex.oe_len;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001937 int ret;
1938
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001939 /*
1940 * Align the range to alloc_size boundary and punt on discards
1941 * that are too small to free up any space.
1942 *
1943 * alloc_size == object_size && is_tail() is a special case for
1944 * filestore with filestore_punch_hole = false, needed to allow
1945 * truncate (in addition to delete).
1946 */
1947 if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
1948 !rbd_obj_is_tail(obj_req)) {
1949 off = round_up(off, rbd_dev->opts->alloc_size);
1950 next_off = round_down(next_off, rbd_dev->opts->alloc_size);
1951 if (off >= next_off)
1952 return 1;
1953 }
1954
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001955 /* reverse map the entire object onto the parent */
1956 ret = rbd_obj_calc_img_extents(obj_req, true);
1957 if (ret)
1958 return ret;
1959
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001960 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
1961 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
1962
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001963 osd_req = rbd_obj_add_osd_request(obj_req, 1);
1964 if (IS_ERR(osd_req))
1965 return PTR_ERR(osd_req);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001966
1967 if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02001968 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001969 osd_req_op_init(osd_req, 0, CEPH_OSD_OP_DELETE, 0);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001970 } else {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001971 dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
1972 obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
1973 off, next_off - off);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001974 osd_req_op_extent_init(osd_req, 0,
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001975 truncate_or_zero_opcode(obj_req),
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01001976 off, next_off - off, 0, 0);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001977 }
1978
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02001979 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001980 rbd_osd_format_write(osd_req);
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01001981 return 0;
1982}
1983
Ilya Dryomov13488d52019-02-25 12:37:50 +01001984static int count_zeroout_ops(struct rbd_obj_request *obj_req)
1985{
1986 int num_osd_ops;
1987
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01001988 if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
1989 !rbd_obj_copyup_enabled(obj_req))
Ilya Dryomov13488d52019-02-25 12:37:50 +01001990 num_osd_ops = 2; /* create + truncate */
1991 else
1992 num_osd_ops = 1; /* delete/truncate/zero */
1993
1994 return num_osd_ops;
1995}
1996
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02001997static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
1998 int which)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01001999{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002000 struct rbd_obj_request *obj_req = osd_req->r_priv;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002001 u16 opcode;
2002
2003 if (rbd_obj_is_entire(obj_req)) {
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002004 if (obj_req->num_img_extents) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002005 if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002006 osd_req_op_init(osd_req, which++,
Ilya Dryomov9b17eb22019-02-28 15:51:39 +01002007 CEPH_OSD_OP_CREATE, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002008 opcode = CEPH_OSD_OP_TRUNCATE;
2009 } else {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002010 rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002011 osd_req_op_init(osd_req, which++,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002012 CEPH_OSD_OP_DELETE, 0);
2013 opcode = 0;
2014 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002015 } else {
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002016 opcode = truncate_or_zero_opcode(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002017 }
2018
2019 if (opcode)
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002020 osd_req_op_extent_init(osd_req, which, opcode,
Ilya Dryomov43df3d32018-02-02 15:23:22 +01002021 obj_req->ex.oe_off, obj_req->ex.oe_len,
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002022 0, 0);
2023
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002024 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002025}
2026
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002027static int rbd_obj_setup_zeroout(struct rbd_obj_request *obj_req)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002028{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002029 struct ceph_osd_request *osd_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002030 unsigned int num_osd_ops, which = 0;
2031 int ret;
2032
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002033 /* reverse map the entire object onto the parent */
2034 ret = rbd_obj_calc_img_extents(obj_req, true);
2035 if (ret)
2036 return ret;
2037
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002038 if (rbd_obj_copyup_enabled(obj_req))
2039 obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
2040 if (!obj_req->num_img_extents) {
2041 if (rbd_obj_is_entire(obj_req))
2042 obj_req->flags |= RBD_OBJ_FLAG_DELETION;
2043 }
2044
2045 num_osd_ops = count_zeroout_ops(obj_req);
2046 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
2047 num_osd_ops++; /* stat */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002048
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002049 osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
2050 if (IS_ERR(osd_req))
2051 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002052
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002053 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002054 ret = rbd_osd_setup_stat(osd_req, which++);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002055 if (ret)
2056 return ret;
2057 }
2058
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002059 obj_req->write_state = RBD_OBJ_WRITE_START;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002060 __rbd_osd_setup_zeroout_ops(osd_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002061 return 0;
2062}
2063
2064/*
2065 * For each object request in @img_req, allocate an OSD request, add
2066 * individual OSD ops and prepare them for submission. The number of
2067 * OSD ops depends on op_type and the overlap point (if any).
2068 */
2069static int __rbd_img_fill_request(struct rbd_img_request *img_req)
2070{
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002071 struct rbd_obj_request *obj_req, *next_obj_req;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002072 struct ceph_osd_request *osd_req;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002073 int ret;
2074
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002075 for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
Ilya Dryomov9bb02482018-01-30 17:52:10 +01002076 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002077 case OBJ_OP_READ:
2078 ret = rbd_obj_setup_read(obj_req);
2079 break;
2080 case OBJ_OP_WRITE:
2081 ret = rbd_obj_setup_write(obj_req);
2082 break;
2083 case OBJ_OP_DISCARD:
2084 ret = rbd_obj_setup_discard(obj_req);
2085 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002086 case OBJ_OP_ZEROOUT:
2087 ret = rbd_obj_setup_zeroout(obj_req);
2088 break;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002089 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002090 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002091 }
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002092 if (ret < 0)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002093 return ret;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002094 if (ret > 0) {
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01002095 rbd_img_obj_request_del(img_req, obj_req);
2096 continue;
2097 }
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002098
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002099 osd_req = list_last_entry(&obj_req->osd_reqs,
2100 struct ceph_osd_request,
2101 r_private_item);
2102 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002103 if (ret)
2104 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002105 }
2106
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002107 img_req->state = RBD_IMG_START;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002108 return 0;
2109}
2110
Ilya Dryomov5a237812018-02-06 19:26:34 +01002111union rbd_img_fill_iter {
2112 struct ceph_bio_iter bio_iter;
2113 struct ceph_bvec_iter bvec_iter;
2114};
2115
2116struct rbd_img_fill_ctx {
2117 enum obj_request_type pos_type;
2118 union rbd_img_fill_iter *pos;
2119 union rbd_img_fill_iter iter;
2120 ceph_object_extent_fn_t set_pos_fn;
Ilya Dryomovafb97882018-02-06 19:26:35 +01002121 ceph_object_extent_fn_t count_fn;
2122 ceph_object_extent_fn_t copy_fn;
Ilya Dryomov5a237812018-02-06 19:26:34 +01002123};
2124
2125static struct ceph_object_extent *alloc_object_extent(void *arg)
2126{
2127 struct rbd_img_request *img_req = arg;
2128 struct rbd_obj_request *obj_req;
2129
2130 obj_req = rbd_obj_request_create();
2131 if (!obj_req)
2132 return NULL;
2133
2134 rbd_img_obj_request_add(img_req, obj_req);
2135 return &obj_req->ex;
2136}
2137
2138/*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002139 * While su != os && sc == 1 is technically not fancy (it's the same
2140 * layout as su == os && sc == 1), we can't use the nocopy path for it
2141 * because ->set_pos_fn() should be called only once per object.
2142 * ceph_file_to_extents() invokes action_fn once per stripe unit, so
2143 * treat su != os && sc == 1 as fancy.
Ilya Dryomov5a237812018-02-06 19:26:34 +01002144 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002145static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
2146{
2147 return l->stripe_unit != l->object_size;
2148}
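/*
 * Example: su = 1M, os = 4M, sc = 1 is "fancy" by this test; each 4M
 * object is fed to ceph_file_to_extents() one 1M stripe unit at a
 * time, so a per-object callback would fire up to four times.  The
 * default su == os layout is not fancy and takes the nocopy path.
 */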
2149
2150static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
2151 struct ceph_file_extent *img_extents,
2152 u32 num_img_extents,
2153 struct rbd_img_fill_ctx *fctx)
Ilya Dryomov5a237812018-02-06 19:26:34 +01002154{
2155 u32 i;
2156 int ret;
2157
2158 img_req->data_type = fctx->pos_type;
2159
2160 /*
2161 * Create object requests and set each object request's starting
2162 * position in the provided bio (list) or bio_vec array.
2163 */
2164 fctx->iter = *fctx->pos;
2165 for (i = 0; i < num_img_extents; i++) {
2166 ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
2167 img_extents[i].fe_off,
2168 img_extents[i].fe_len,
2169 &img_req->object_extents,
2170 alloc_object_extent, img_req,
2171 fctx->set_pos_fn, &fctx->iter);
2172 if (ret)
2173 return ret;
2174 }
2175
2176 return __rbd_img_fill_request(img_req);
2177}
2178
Ilya Dryomovafb97882018-02-06 19:26:35 +01002179/*
2180 * Map a list of image extents to a list of object extents, create the
2181 * corresponding object requests (normally each to a different object,
2182 * but not always) and add them to @img_req. For each object request,
2183 * set up its data descriptor to point to the corresponding chunk(s) of
2184 * @fctx->pos data buffer.
2185 *
2186 * Because ceph_file_to_extents() will merge adjacent object extents
2187 * together, each object request's data descriptor may point to multiple
2188 * different chunks of @fctx->pos data buffer.
2189 *
2190 * @fctx->pos data buffer is assumed to be large enough.
2191 */
2192static int rbd_img_fill_request(struct rbd_img_request *img_req,
2193 struct ceph_file_extent *img_extents,
2194 u32 num_img_extents,
2195 struct rbd_img_fill_ctx *fctx)
2196{
2197 struct rbd_device *rbd_dev = img_req->rbd_dev;
2198 struct rbd_obj_request *obj_req;
2199 u32 i;
2200 int ret;
2201
2202 if (fctx->pos_type == OBJ_REQUEST_NODATA ||
2203 !rbd_layout_is_fancy(&rbd_dev->layout))
2204 return rbd_img_fill_request_nocopy(img_req, img_extents,
2205 num_img_extents, fctx);
2206
2207 img_req->data_type = OBJ_REQUEST_OWN_BVECS;
2208
2209 /*
2210 * Create object requests and determine ->bvec_count for each object
2211 * request. Note that ->bvec_count sum over all object requests may
2212 * be greater than the number of bio_vecs in the provided bio (list)
2213 * or bio_vec array because when mapped, those bio_vecs can straddle
2214 * stripe unit boundaries.
2215 */
2216 fctx->iter = *fctx->pos;
2217 for (i = 0; i < num_img_extents; i++) {
2218 ret = ceph_file_to_extents(&rbd_dev->layout,
2219 img_extents[i].fe_off,
2220 img_extents[i].fe_len,
2221 &img_req->object_extents,
2222 alloc_object_extent, img_req,
2223 fctx->count_fn, &fctx->iter);
2224 if (ret)
2225 return ret;
2226 }
2227
2228 for_each_obj_request(img_req, obj_req) {
2229 obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
2230 sizeof(*obj_req->bvec_pos.bvecs),
2231 GFP_NOIO);
2232 if (!obj_req->bvec_pos.bvecs)
2233 return -ENOMEM;
Alex Elderb454e362013-04-19 15:34:50 -05002234 }
2235
2236 /*
Ilya Dryomovafb97882018-02-06 19:26:35 +01002237 * Fill in each object request's private bio_vec array, splitting and
2238 * rearranging the provided bio_vecs in stripe unit chunks as needed.
Alex Elderb454e362013-04-19 15:34:50 -05002239 */
Ilya Dryomovafb97882018-02-06 19:26:35 +01002240 fctx->iter = *fctx->pos;
2241 for (i = 0; i < num_img_extents; i++) {
2242 ret = ceph_iterate_extents(&rbd_dev->layout,
2243 img_extents[i].fe_off,
2244 img_extents[i].fe_len,
2245 &img_req->object_extents,
2246 fctx->copy_fn, &fctx->iter);
2247 if (ret)
2248 return ret;
2249 }
Alex Elder3d7efd12013-04-19 15:34:50 -05002250
Ilya Dryomovafb97882018-02-06 19:26:35 +01002251 return __rbd_img_fill_request(img_req);
Alex Elderb454e362013-04-19 15:34:50 -05002252}
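/*
 * To summarize the passes above for the fancy case: the first pass
 * (count_fn) sizes each object request's private bvec array, the
 * kmalloc_array() loop allocates it, and the final pass (copy_fn)
 * splits the caller's bio_vecs into it.  A single 8K bio_vec that
 * straddles a stripe unit boundary is counted, and later copied, as
 * two chunks that may belong to two different object requests.
 */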
2253
Ilya Dryomov5a237812018-02-06 19:26:34 +01002254static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
2255 u64 off, u64 len)
2256{
2257 struct ceph_file_extent ex = { off, len };
2258 union rbd_img_fill_iter dummy;
2259 struct rbd_img_fill_ctx fctx = {
2260 .pos_type = OBJ_REQUEST_NODATA,
2261 .pos = &dummy,
2262 };
2263
2264 return rbd_img_fill_request(img_req, &ex, 1, &fctx);
2265}
2266
2267static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2268{
2269 struct rbd_obj_request *obj_req =
2270 container_of(ex, struct rbd_obj_request, ex);
2271 struct ceph_bio_iter *it = arg;
2272
2273 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2274 obj_req->bio_pos = *it;
2275 ceph_bio_iter_advance(it, bytes);
2276}
2277
Ilya Dryomovafb97882018-02-06 19:26:35 +01002278static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2279{
2280 struct rbd_obj_request *obj_req =
2281 container_of(ex, struct rbd_obj_request, ex);
2282 struct ceph_bio_iter *it = arg;
2283
2284 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2285 ceph_bio_iter_advance_step(it, bytes, ({
2286 obj_req->bvec_count++;
2287 }));
2288
2289}
2290
2291static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2292{
2293 struct rbd_obj_request *obj_req =
2294 container_of(ex, struct rbd_obj_request, ex);
2295 struct ceph_bio_iter *it = arg;
2296
2297 dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
2298 ceph_bio_iter_advance_step(it, bytes, ({
2299 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2300 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2301 }));
2302}
2303
Ilya Dryomov5a237812018-02-06 19:26:34 +01002304static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2305 struct ceph_file_extent *img_extents,
2306 u32 num_img_extents,
2307 struct ceph_bio_iter *bio_pos)
2308{
2309 struct rbd_img_fill_ctx fctx = {
2310 .pos_type = OBJ_REQUEST_BIO,
2311 .pos = (union rbd_img_fill_iter *)bio_pos,
2312 .set_pos_fn = set_bio_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002313 .count_fn = count_bio_bvecs,
2314 .copy_fn = copy_bio_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002315 };
2316
2317 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2318 &fctx);
2319}
2320
2321static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
2322 u64 off, u64 len, struct bio *bio)
2323{
2324 struct ceph_file_extent ex = { off, len };
2325 struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
2326
2327 return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
2328}
2329
2330static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
2331{
2332 struct rbd_obj_request *obj_req =
2333 container_of(ex, struct rbd_obj_request, ex);
2334 struct ceph_bvec_iter *it = arg;
2335
2336 obj_req->bvec_pos = *it;
2337 ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
2338 ceph_bvec_iter_advance(it, bytes);
2339}
2340
Ilya Dryomovafb97882018-02-06 19:26:35 +01002341static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2342{
2343 struct rbd_obj_request *obj_req =
2344 container_of(ex, struct rbd_obj_request, ex);
2345 struct ceph_bvec_iter *it = arg;
2346
2347 ceph_bvec_iter_advance_step(it, bytes, ({
2348 obj_req->bvec_count++;
2349 }));
2350}
2351
2352static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
2353{
2354 struct rbd_obj_request *obj_req =
2355 container_of(ex, struct rbd_obj_request, ex);
2356 struct ceph_bvec_iter *it = arg;
2357
2358 ceph_bvec_iter_advance_step(it, bytes, ({
2359 obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
2360 obj_req->bvec_pos.iter.bi_size += bv.bv_len;
2361 }));
2362}
2363
Ilya Dryomov5a237812018-02-06 19:26:34 +01002364static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2365 struct ceph_file_extent *img_extents,
2366 u32 num_img_extents,
2367 struct ceph_bvec_iter *bvec_pos)
2368{
2369 struct rbd_img_fill_ctx fctx = {
2370 .pos_type = OBJ_REQUEST_BVECS,
2371 .pos = (union rbd_img_fill_iter *)bvec_pos,
2372 .set_pos_fn = set_bvec_pos,
Ilya Dryomovafb97882018-02-06 19:26:35 +01002373 .count_fn = count_bvecs,
2374 .copy_fn = copy_bvecs,
Ilya Dryomov5a237812018-02-06 19:26:34 +01002375 };
2376
2377 return rbd_img_fill_request(img_req, img_extents, num_img_extents,
2378 &fctx);
2379}
2380
2381static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
2382 struct ceph_file_extent *img_extents,
2383 u32 num_img_extents,
2384 struct bio_vec *bvecs)
2385{
2386 struct ceph_bvec_iter it = {
2387 .bvecs = bvecs,
2388 .iter = { .bi_size = ceph_file_extents_bytes(img_extents,
2389 num_img_extents) },
2390 };
2391
2392 return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
2393 &it);
2394}
2395
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002396static void rbd_img_handle_request_work(struct work_struct *work)
Alex Elderbf0d5f502012-11-22 00:00:08 -06002397{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002398 struct rbd_img_request *img_req =
2399 container_of(work, struct rbd_img_request, work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002400
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002401 rbd_img_handle_request(img_req, img_req->work_result);
2402}
Alex Elderbf0d5f502012-11-22 00:00:08 -06002403
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002404static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
2405{
2406 INIT_WORK(&img_req->work, rbd_img_handle_request_work);
2407 img_req->work_result = result;
2408 queue_work(rbd_wq, &img_req->work);
Alex Elderbf0d5f502012-11-22 00:00:08 -06002409}
2410
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002411static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
2412{
2413 rbd_obj_request_submit(obj_req);
2414 return 0;
2415}
2416
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002417static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
Alex Elder8b3e1a52013-01-24 16:13:36 -06002418{
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002419 struct rbd_img_request *img_req = obj_req->img_request;
2420 struct rbd_img_request *child_img_req;
2421 int ret;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002422
Ilya Dryomove93aca02018-02-06 19:26:35 +01002423 child_img_req = rbd_img_request_create(img_req->rbd_dev->parent,
2424 OBJ_OP_READ, NULL);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002425 if (!child_img_req)
2426 return -ENOMEM;
Alex Elder8b3e1a52013-01-24 16:13:36 -06002427
Ilya Dryomove93aca02018-02-06 19:26:35 +01002428 __set_bit(IMG_REQ_CHILD, &child_img_req->flags);
2429 child_img_req->obj_request = obj_req;
Alex Elder02c74fb2013-05-06 17:40:33 -05002430
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002431 if (!rbd_img_is_write(img_req)) {
Ilya Dryomovecc633c2018-02-01 11:50:47 +01002432 switch (img_req->data_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002433 case OBJ_REQUEST_BIO:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002434 ret = __rbd_img_fill_from_bio(child_img_req,
2435 obj_req->img_extents,
2436 obj_req->num_img_extents,
2437 &obj_req->bio_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002438 break;
2439 case OBJ_REQUEST_BVECS:
Ilya Dryomovafb97882018-02-06 19:26:35 +01002440 case OBJ_REQUEST_OWN_BVECS:
Ilya Dryomov5a237812018-02-06 19:26:34 +01002441 ret = __rbd_img_fill_from_bvecs(child_img_req,
2442 obj_req->img_extents,
2443 obj_req->num_img_extents,
2444 &obj_req->bvec_pos);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002445 break;
2446 default:
Arnd Bergmannd342a152019-03-22 15:36:37 +01002447 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002448 }
2449 } else {
Ilya Dryomov5a237812018-02-06 19:26:34 +01002450 ret = rbd_img_fill_from_bvecs(child_img_req,
2451 obj_req->img_extents,
2452 obj_req->num_img_extents,
2453 obj_req->copyup_bvecs);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002454 }
2455 if (ret) {
2456 rbd_img_request_put(child_img_req);
2457 return ret;
2458 }
2459
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002460 /* avoid parent chain recursion */
2461 rbd_img_schedule(child_img_req, 0);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002462 return 0;
2463}
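
/*
 * Note on the child request built above: it reuses the parent object
 * request's own buffers (bio_pos/bvec_pos for plain reads, copyup_bvecs
 * when gathering copyup data), so data read from the parent image lands
 * directly where the original request expects it.  The child is kicked
 * off through rbd_img_schedule() rather than run inline -- see the
 * "avoid parent chain recursion" comment -- so deep clone chains do not
 * recurse on the kernel stack.
 */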
2464
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002465static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002466{
2467 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
2468 int ret;
2469
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002470 switch (obj_req->read_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002471 case RBD_OBJ_READ_START:
2472 rbd_assert(!*result);
2473
2474 ret = rbd_obj_read_object(obj_req);
2475 if (ret) {
2476 *result = ret;
2477 return true;
2478 }
2479 obj_req->read_state = RBD_OBJ_READ_OBJECT;
2480 return false;
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002481 case RBD_OBJ_READ_OBJECT:
2482 if (*result == -ENOENT && rbd_dev->parent_overlap) {
2483 /* reverse map this object extent onto the parent */
2484 ret = rbd_obj_calc_img_extents(obj_req, false);
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002485 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002486 *result = ret;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002487 return true;
2488 }
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002489 if (obj_req->num_img_extents) {
2490 ret = rbd_obj_read_from_parent(obj_req);
2491 if (ret) {
2492 *result = ret;
2493 return true;
2494 }
2495 obj_req->read_state = RBD_OBJ_READ_PARENT;
2496 return false;
2497 }
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002498 }
Alex Elder02c74fb2013-05-06 17:40:33 -05002499
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002500 /*
2501 * -ENOENT means a hole in the image -- zero-fill the entire
2502 * length of the request. A short read also implies zero-fill
2503 * to the end of the request.
2504 */
2505 if (*result == -ENOENT) {
2506 rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
2507 *result = 0;
2508 } else if (*result >= 0) {
2509 if (*result < obj_req->ex.oe_len)
2510 rbd_obj_zero_range(obj_req, *result,
2511 obj_req->ex.oe_len - *result);
2512 else
2513 rbd_assert(*result == obj_req->ex.oe_len);
2514 *result = 0;
2515 }
2516 return true;
2517 case RBD_OBJ_READ_PARENT:
2518 return true;
2519 default:
2520 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002521 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002522}
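
/*
 * Read state machine implemented by rbd_obj_advance_read(), as read off
 * the switch above:
 *
 *   READ_START -> READ_OBJECT -> done
 *                     |
 *                     | -ENOENT within the parent overlap
 *                     v
 *                READ_PARENT -> done
 *
 * -ENOENT outside the overlap means a hole and is zero-filled, as is
 * the tail of any short read.  Returning true reports completion, with
 * *result carrying the final status.
 */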
2523
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002524static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
2525{
2526 rbd_obj_request_submit(obj_req);
2527 return 0;
2528}
2529
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002530/*
2531 * copyup_bvecs pages are never highmem pages
2532 */
2533static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
2534{
2535 struct ceph_bvec_iter it = {
2536 .bvecs = bvecs,
2537 .iter = { .bi_size = bytes },
2538 };
2539
2540 ceph_bvec_iter_advance_step(&it, bytes, ({
2541 if (memchr_inv(page_address(bv.bv_page) + bv.bv_offset, 0,
2542 bv.bv_len))
2543 return false;
2544 }));
2545 return true;
2546}
2547
Ilya Dryomov3a482502019-02-28 10:49:12 +01002548#define MODS_ONLY U32_MAX
2549
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002550static int rbd_obj_issue_copyup_empty_snapc(struct rbd_obj_request *obj_req,
2551 u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002552{
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002553 struct ceph_osd_request *osd_req;
Chengguang Xufe943d52018-04-12 12:04:55 +08002554 int ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002555
2556 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002557 rbd_assert(bytes > 0 && bytes != MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002558
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002559 osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
2560 if (IS_ERR(osd_req))
2561 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002562
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002563 ret = osd_req_op_cls_init(osd_req, 0, "rbd", "copyup");
Chengguang Xufe943d52018-04-12 12:04:55 +08002564 if (ret)
2565 return ret;
2566
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002567 osd_req_op_cls_request_data_bvecs(osd_req, 0,
Ilya Dryomov0010f702018-05-04 16:57:30 +02002568 obj_req->copyup_bvecs,
2569 obj_req->copyup_bvec_count,
2570 bytes);
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002571 rbd_osd_format_write(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002572
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002573 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002574 if (ret)
2575 return ret;
2576
2577 rbd_obj_request_submit(obj_req);
2578 return 0;
2579}
2580
Ilya Dryomov3a482502019-02-28 10:49:12 +01002581static int rbd_obj_issue_copyup_ops(struct rbd_obj_request *obj_req, u32 bytes)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002582{
Ilya Dryomov13488d52019-02-25 12:37:50 +01002583 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002584 struct ceph_osd_request *osd_req;
Ilya Dryomov3a482502019-02-28 10:49:12 +01002585 unsigned int num_osd_ops = (bytes != MODS_ONLY);
2586 unsigned int which = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002587 int ret;
2588
2589 dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002590
Ilya Dryomov13488d52019-02-25 12:37:50 +01002591 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002592 case OBJ_OP_WRITE:
Ilya Dryomov13488d52019-02-25 12:37:50 +01002593 num_osd_ops += count_write_ops(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002594 break;
Ilya Dryomov13488d52019-02-25 12:37:50 +01002595 case OBJ_OP_ZEROOUT:
2596 num_osd_ops += count_zeroout_ops(obj_req);
2597 break;
2598 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002599 BUG();
Ilya Dryomov13488d52019-02-25 12:37:50 +01002600 }
2601
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002602 osd_req = rbd_obj_add_osd_request(obj_req, num_osd_ops);
2603 if (IS_ERR(osd_req))
2604 return PTR_ERR(osd_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002605
Ilya Dryomov3a482502019-02-28 10:49:12 +01002606 if (bytes != MODS_ONLY) {
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002607 ret = osd_req_op_cls_init(osd_req, which, "rbd",
Ilya Dryomov3a482502019-02-28 10:49:12 +01002608 "copyup");
2609 if (ret)
2610 return ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002611
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002612 osd_req_op_cls_request_data_bvecs(osd_req, which++,
Ilya Dryomov3a482502019-02-28 10:49:12 +01002613 obj_req->copyup_bvecs,
2614 obj_req->copyup_bvec_count,
2615 bytes);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002616 }
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002617
Ilya Dryomov13488d52019-02-25 12:37:50 +01002618 switch (img_req->op_type) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002619 case OBJ_OP_WRITE:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002620 __rbd_osd_setup_write_ops(osd_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002621 break;
Ilya Dryomov6484cbe2019-01-29 12:46:25 +01002622 case OBJ_OP_ZEROOUT:
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002623 __rbd_osd_setup_zeroout_ops(osd_req, which);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002624 break;
2625 default:
Arnd Bergmann16809372019-03-22 17:53:56 +01002626 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002627 }
2628
Ilya Dryomovbcbab1d2019-05-27 11:41:36 +02002629 ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
Ilya Dryomov26f887e2018-10-15 16:11:37 +02002630 if (ret)
2631 return ret;
2632
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002633 rbd_obj_request_submit(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002634 return 0;
2635}
2636
Ilya Dryomov3a482502019-02-28 10:49:12 +01002637static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
2638{
2639 /*
2640 * Only send non-zero copyup data to save some I/O and network
2641 * bandwidth -- zero copyup data is equivalent to the object not
2642 * existing.
2643 */
2644 if (is_zero_bvecs(obj_req->copyup_bvecs, bytes)) {
2645 dout("%s obj_req %p detected zeroes\n", __func__, obj_req);
2646 bytes = 0;
2647 }
2648
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002649 if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
2650 /*
2651 * Send a copyup request with an empty snapshot context to
2652 * deep-copyup the object through all existing snapshots.
2653 * A second request with the current snapshot context will be
2654 * sent for the actual modification.
2655 */
2656 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC;
2657 return rbd_obj_issue_copyup_empty_snapc(obj_req, bytes);
2658 }
2659
Ilya Dryomov3a482502019-02-28 10:49:12 +01002660 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2661 return rbd_obj_issue_copyup_ops(obj_req, bytes);
2662}
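
/*
 * Taken together, rbd_obj_issue_copyup() ends up on one of three paths:
 *  1. all-zero parent data: bytes is clamped to 0 and the copyup op
 *     carries no data, which (per the comment above) the OSD treats the
 *     same as the object not existing;
 *  2. snapshots exist: a copyup with an empty snapshot context is sent
 *     first to deep-copyup the object, then a second MODS_ONLY request
 *     carries the actual modification;
 *  3. otherwise: a single request combines the copyup with the write or
 *     zeroout ops.
 */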
2663
Ilya Dryomov7e07efb2018-01-20 10:30:11 +01002664static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
2665{
2666 u32 i;
2667
2668 rbd_assert(!obj_req->copyup_bvecs);
2669 obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
2670 obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
2671 sizeof(*obj_req->copyup_bvecs),
2672 GFP_NOIO);
2673 if (!obj_req->copyup_bvecs)
2674 return -ENOMEM;
2675
2676 for (i = 0; i < obj_req->copyup_bvec_count; i++) {
2677 unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
2678
2679 obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
2680 if (!obj_req->copyup_bvecs[i].bv_page)
2681 return -ENOMEM;
2682
2683 obj_req->copyup_bvecs[i].bv_offset = 0;
2684 obj_req->copyup_bvecs[i].bv_len = len;
2685 obj_overlap -= len;
2686 }
2687
2688 rbd_assert(!obj_overlap);
2689 return 0;
2690}
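
/*
 * Worked example for setup_copyup_bvecs(): a 9K overlap with 4K pages
 * gives calc_pages_for(0, 9K) = 3 bvecs of 4K, 4K and 1K -- full pages
 * with only the last bvec trimmed.  Allocations use GFP_NOIO because
 * this sits on the write path; since copyup_bvecs is attached to
 * obj_req up front, a partially built array is presumably released by
 * the object request's teardown on -ENOMEM.
 */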
2691
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002692/*
2693 * The target object doesn't exist. Read the data for the entire
2694 * target object up to the overlap point (if any) from the parent,
2695 * so we can use it for a copyup.
2696 */
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002697static int rbd_obj_handle_write_guard(struct rbd_obj_request *obj_req)
2698{
2699 struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002700 int ret;
2701
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002702 rbd_assert(obj_req->num_img_extents);
2703 prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
2704 rbd_dev->parent_overlap);
2705 if (!obj_req->num_img_extents) {
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002706 /*
2707 * The overlap has become 0 (most likely because the
Ilya Dryomov3a482502019-02-28 10:49:12 +01002708 * image has been flattened). Re-submit the original write
2709 * request -- pass MODS_ONLY since the copyup isn't needed
2710 * anymore.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002711 */
Ilya Dryomov3a482502019-02-28 10:49:12 +01002712 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2713 return rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002714 }
2715
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002716 ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002717 if (ret)
2718 return ret;
2719
Ilya Dryomov3a482502019-02-28 10:49:12 +01002720 obj_req->write_state = RBD_OBJ_WRITE_READ_FROM_PARENT;
Ilya Dryomov86bd7992018-02-06 19:26:33 +01002721 return rbd_obj_read_from_parent(obj_req);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002722}
2723
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002724static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002725{
2726 int ret;
2727
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002728 switch (obj_req->write_state) {
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002729 case RBD_OBJ_WRITE_START:
2730 rbd_assert(!*result);
2731
2732 ret = rbd_obj_write_object(obj_req);
2733 if (ret) {
2734 *result = ret;
2735 return true;
2736 }
2737 obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
2738 return false;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002739 case RBD_OBJ_WRITE_OBJECT:
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002740 if (*result == -ENOENT) {
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002741 if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
2742 ret = rbd_obj_handle_write_guard(obj_req);
2743 if (ret) {
2744 *result = ret;
2745 return true;
2746 }
2747 return false;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002748 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002749 /*
2750 * On a non-existent object:
2751 * delete - -ENOENT, truncate/zero - 0
2752 */
2753 if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
2754 *result = 0;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002755 }
2756 /* fall through */
Ilya Dryomov3a482502019-02-28 10:49:12 +01002757 case RBD_OBJ_WRITE_COPYUP_OPS:
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002758 return true;
Ilya Dryomov3a482502019-02-28 10:49:12 +01002759 case RBD_OBJ_WRITE_READ_FROM_PARENT:
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002760 if (*result)
Ilya Dryomov3a482502019-02-28 10:49:12 +01002761 return true;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002762
Ilya Dryomova9b67e62019-05-08 13:35:57 +02002763 ret = rbd_obj_issue_copyup(obj_req,
2764 rbd_obj_img_extents_bytes(obj_req));
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002765 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002766 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002767 return true;
2768 }
2769 return false;
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002770 case RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC:
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002771 if (*result)
Ilya Dryomov89a59c12019-02-28 14:20:28 +01002772 return true;
2773
2774 obj_req->write_state = RBD_OBJ_WRITE_COPYUP_OPS;
2775 ret = rbd_obj_issue_copyup_ops(obj_req, MODS_ONLY);
2776 if (ret) {
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002777 *result = ret;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002778 return true;
2779 }
2780 return false;
2781 default:
Arnd Bergmannc6244b32018-04-04 14:53:39 +02002782 BUG();
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002783 }
2784}
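
/*
 * Write state machine implemented by rbd_obj_advance_write(), as read
 * off the switch above:
 *
 *   WRITE_START -> WRITE_OBJECT -> done
 *                      |
 *                      | -ENOENT && RBD_OBJ_FLAG_COPYUP_ENABLED
 *                      v
 *              READ_FROM_PARENT -> COPYUP_EMPTY_SNAPC (snapshots exist)
 *                      |                    |
 *                      +--> COPYUP_OPS <----+
 *                               |
 *                               v
 *                             done
 *
 * If the parent overlap has shrunk to zero (image flattened), the
 * parent read is skipped and a MODS_ONLY request goes straight to
 * COPYUP_OPS.
 */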
2785
2786/*
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002787 * Return true if @obj_req is completed.
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002788 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002789static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
2790 int *result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002791{
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002792 struct rbd_img_request *img_req = obj_req->img_request;
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002793 struct rbd_device *rbd_dev = img_req->rbd_dev;
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002794 bool done;
2795
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002796 mutex_lock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002797 if (!rbd_img_is_write(img_req))
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002798 done = rbd_obj_advance_read(obj_req, result);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002799 else
Ilya Dryomov85b5e6d2019-05-14 21:06:07 +02002800 done = rbd_obj_advance_write(obj_req, result);
2801 mutex_unlock(&obj_req->state_mutex);
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002802
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002803 if (done && *result) {
2804 rbd_assert(*result < 0);
2805 rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
2806 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
2807 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
2808 }
Ilya Dryomov0ad5d952019-05-14 20:45:38 +02002809 return done;
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002810}
2811
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002812/*
2813 * This is open-coded in rbd_img_handle_request() to avoid parent chain
2814 * recursion.
2815 */
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002816static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002817{
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002818 if (__rbd_obj_handle_request(obj_req, &result))
2819 rbd_img_handle_request(obj_req->img_request, result);
2820}
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002821
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002822static void rbd_img_object_requests(struct rbd_img_request *img_req)
2823{
2824 struct rbd_obj_request *obj_req;
2825
2826 rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
2827
2828 for_each_obj_request(img_req, obj_req) {
2829 int result = 0;
2830
2831 if (__rbd_obj_handle_request(obj_req, &result)) {
2832 if (result) {
2833 img_req->pending.result = result;
2834 return;
2835 }
2836 } else {
2837 img_req->pending.num_pending++;
2838 }
2839 }
2840}
2841
2842static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
2843{
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002844again:
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002845 switch (img_req->state) {
2846 case RBD_IMG_START:
2847 rbd_assert(!*result);
Ilya Dryomov3da691b2018-01-29 14:04:08 +01002848
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002849 rbd_img_object_requests(img_req);
2850 if (!img_req->pending.num_pending) {
2851 *result = img_req->pending.result;
2852 img_req->state = RBD_IMG_OBJECT_REQUESTS;
2853 goto again;
2854 }
2855 img_req->state = __RBD_IMG_OBJECT_REQUESTS;
2856 return false;
2857 case __RBD_IMG_OBJECT_REQUESTS:
2858 if (!pending_result_dec(&img_req->pending, result))
2859 return false;
2860 /* fall through */
2861 case RBD_IMG_OBJECT_REQUESTS:
2862 return true;
2863 default:
2864 BUG();
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002865 }
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002866}
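
/*
 * Image-level state machine: RBD_IMG_START pushes every object request
 * through __rbd_obj_handle_request(); those that don't complete
 * immediately are counted in pending.num_pending.
 * __RBD_IMG_OBJECT_REQUESTS then waits for pending_result_dec()
 * (defined earlier) to account for each completion -- presumably
 * latching the first error -- before falling through to
 * RBD_IMG_OBJECT_REQUESTS, the terminal state.
 */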
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002867
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002868/*
2869 * Return true if @img_req is completed.
2870 */
2871static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
2872 int *result)
2873{
2874 struct rbd_device *rbd_dev = img_req->rbd_dev;
2875 bool done;
2876
2877 mutex_lock(&img_req->state_mutex);
2878 done = rbd_img_advance(img_req, result);
2879 mutex_unlock(&img_req->state_mutex);
2880
2881 if (done && *result) {
2882 rbd_assert(*result < 0);
2883 rbd_warn(rbd_dev, "%s%s result %d",
2884 test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
2885 obj_op_name(img_req->op_type), *result);
2886 }
2887 return done;
2888}
2889
2890static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
2891{
2892again:
2893 if (!__rbd_img_handle_request(img_req, &result))
2894 return;
2895
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002896 if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002897 struct rbd_obj_request *obj_req = img_req->obj_request;
2898
Ilya Dryomov54ab3b22019-05-11 16:21:49 +02002899 rbd_img_request_put(img_req);
Ilya Dryomov0192ce22019-05-16 15:06:56 +02002900 if (__rbd_obj_handle_request(obj_req, &result)) {
2901 img_req = obj_req->img_request;
2902 goto again;
2903 }
2904 } else {
2905 struct request *rq = img_req->rq;
2906
2907 rbd_img_request_put(img_req);
2908 blk_mq_end_request(rq, errno_to_blk_status(result));
Ilya Dryomov7114eda2018-02-01 11:50:47 +01002909 }
Alex Elder8b3e1a52013-01-24 16:13:36 -06002910}
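
/*
 * Completion is deliberately iterative: when a child (parent-chain)
 * image request finishes, rbd_img_handle_request() feeds the result
 * back into the originating object request and loops via "goto again"
 * instead of recursing, so arbitrarily deep clone chains complete in
 * constant stack space.  A top-level request terminates in
 * blk_mq_end_request().
 */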
2911
Ilya Dryomoved95b212016-08-12 16:40:02 +02002912static const struct rbd_client_id rbd_empty_cid;
2913
2914static bool rbd_cid_equal(const struct rbd_client_id *lhs,
2915 const struct rbd_client_id *rhs)
2916{
2917 return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
2918}
2919
2920static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
2921{
2922 struct rbd_client_id cid;
2923
2924 mutex_lock(&rbd_dev->watch_mutex);
2925 cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
2926 cid.handle = rbd_dev->watch_cookie;
2927 mutex_unlock(&rbd_dev->watch_mutex);
2928 return cid;
2929}
2930
2931/*
2932 * lock_rwsem must be held for write
2933 */
2934static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
2935 const struct rbd_client_id *cid)
2936{
2937 dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
2938 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
2939 cid->gid, cid->handle);
2940 rbd_dev->owner_cid = *cid; /* struct */
2941}
2942
2943static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
2944{
2945 mutex_lock(&rbd_dev->watch_mutex);
2946 sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
2947 mutex_unlock(&rbd_dev->watch_mutex);
2948}
2949
Florian Margaineedd8ca82017-12-13 16:43:59 +01002950static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
2951{
2952 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
2953
2954 strcpy(rbd_dev->lock_cookie, cookie);
2955 rbd_set_owner_cid(rbd_dev, &cid);
2956 queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
2957}
2958
Ilya Dryomoved95b212016-08-12 16:40:02 +02002959/*
2960 * lock_rwsem must be held for write
2961 */
2962static int rbd_lock(struct rbd_device *rbd_dev)
2963{
2964 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002965 char cookie[32];
2966 int ret;
2967
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002968 WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
2969 rbd_dev->lock_cookie[0] != '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002970
2971 format_lock_cookie(rbd_dev, cookie);
2972 ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
2973 RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
2974 RBD_LOCK_TAG, "", 0);
2975 if (ret)
2976 return ret;
2977
2978 rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
Florian Margaineedd8ca82017-12-13 16:43:59 +01002979 __rbd_lock(rbd_dev, cookie);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002980 return 0;
2981}
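
/*
 * The lock cookie embeds the watch cookie (see format_lock_cookie()),
 * tying the exclusive lock to this client's watch on the header object.
 * Other clients can therefore map the lock owner to a live watcher --
 * find_watcher() below relies on exactly this when deciding whether a
 * lock holder is dead and its lock may be broken.
 */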
2982
2983/*
2984 * lock_rwsem must be held for write
2985 */
Ilya Dryomovbbead742017-04-13 12:17:38 +02002986static void rbd_unlock(struct rbd_device *rbd_dev)
Ilya Dryomoved95b212016-08-12 16:40:02 +02002987{
2988 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomoved95b212016-08-12 16:40:02 +02002989 int ret;
2990
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002991 WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
2992 rbd_dev->lock_cookie[0] == '\0');
Ilya Dryomoved95b212016-08-12 16:40:02 +02002993
Ilya Dryomoved95b212016-08-12 16:40:02 +02002994 ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02002995 RBD_LOCK_NAME, rbd_dev->lock_cookie);
Ilya Dryomovbbead742017-04-13 12:17:38 +02002996 if (ret && ret != -ENOENT)
2997 rbd_warn(rbd_dev, "failed to unlock: %d", ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02002998
Ilya Dryomovbbead742017-04-13 12:17:38 +02002999 /* treat errors as the image is unlocked */
3000 rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
Ilya Dryomovcbbfb0f2017-04-13 12:17:38 +02003001 rbd_dev->lock_cookie[0] = '\0';
Ilya Dryomoved95b212016-08-12 16:40:02 +02003002 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3003 queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003004}
3005
3006static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
3007 enum rbd_notify_op notify_op,
3008 struct page ***preply_pages,
3009 size_t *preply_len)
3010{
3011 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3012 struct rbd_client_id cid = rbd_get_cid(rbd_dev);
Kyle Spiers08a79102018-03-17 09:44:01 -07003013 char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
3014 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003015 void *p = buf;
3016
3017 dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
3018
3019 /* encode *LockPayload NotifyMessage (op + ClientId) */
3020 ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
3021 ceph_encode_32(&p, notify_op);
3022 ceph_encode_64(&p, cid.gid);
3023 ceph_encode_64(&p, cid.handle);
3024
3025 return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
3026 &rbd_dev->header_oloc, buf, buf_size,
3027 RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
3028}
3029
3030static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
3031 enum rbd_notify_op notify_op)
3032{
3033 struct page **reply_pages;
3034 size_t reply_len;
3035
3036 __rbd_notify_op_lock(rbd_dev, notify_op, &reply_pages, &reply_len);
3037 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3038}
3039
3040static void rbd_notify_acquired_lock(struct work_struct *work)
3041{
3042 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3043 acquired_lock_work);
3044
3045 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
3046}
3047
3048static void rbd_notify_released_lock(struct work_struct *work)
3049{
3050 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3051 released_lock_work);
3052
3053 rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
3054}
3055
3056static int rbd_request_lock(struct rbd_device *rbd_dev)
3057{
3058 struct page **reply_pages;
3059 size_t reply_len;
3060 bool lock_owner_responded = false;
3061 int ret;
3062
3063 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3064
3065 ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
3066 &reply_pages, &reply_len);
3067 if (ret && ret != -ETIMEDOUT) {
3068 rbd_warn(rbd_dev, "failed to request lock: %d", ret);
3069 goto out;
3070 }
3071
3072 if (reply_len > 0 && reply_len <= PAGE_SIZE) {
3073 void *p = page_address(reply_pages[0]);
3074 void *const end = p + reply_len;
3075 u32 n;
3076
3077 ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
3078 while (n--) {
3079 u8 struct_v;
3080 u32 len;
3081
3082 ceph_decode_need(&p, end, 8 + 8, e_inval);
3083 p += 8 + 8; /* skip gid and cookie */
3084
3085 ceph_decode_32_safe(&p, end, len, e_inval);
3086 if (!len)
3087 continue;
3088
3089 if (lock_owner_responded) {
3090 rbd_warn(rbd_dev,
3091 "duplicate lock owners detected");
3092 ret = -EIO;
3093 goto out;
3094 }
3095
3096 lock_owner_responded = true;
3097 ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
3098 &struct_v, &len);
3099 if (ret) {
3100 rbd_warn(rbd_dev,
3101 "failed to decode ResponseMessage: %d",
3102 ret);
3103 goto e_inval;
3104 }
3105
3106 ret = ceph_decode_32(&p);
3107 }
3108 }
3109
3110 if (!lock_owner_responded) {
3111 rbd_warn(rbd_dev, "no lock owners detected");
3112 ret = -ETIMEDOUT;
3113 }
3114
3115out:
3116 ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
3117 return ret;
3118
3119e_inval:
3120 ret = -EINVAL;
3121 goto out;
3122}
3123
3124static void wake_requests(struct rbd_device *rbd_dev, bool wake_all)
3125{
3126 dout("%s rbd_dev %p wake_all %d\n", __func__, rbd_dev, wake_all);
3127
3128 cancel_delayed_work(&rbd_dev->lock_dwork);
3129 if (wake_all)
3130 wake_up_all(&rbd_dev->lock_waitq);
3131 else
3132 wake_up(&rbd_dev->lock_waitq);
3133}
3134
3135static int get_lock_owner_info(struct rbd_device *rbd_dev,
3136 struct ceph_locker **lockers, u32 *num_lockers)
3137{
3138 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3139 u8 lock_type;
3140 char *lock_tag;
3141 int ret;
3142
3143 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3144
3145 ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
3146 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3147 &lock_type, &lock_tag, lockers, num_lockers);
3148 if (ret)
3149 return ret;
3150
3151 if (*num_lockers == 0) {
3152 dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
3153 goto out;
3154 }
3155
3156 if (strcmp(lock_tag, RBD_LOCK_TAG)) {
3157 rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
3158 lock_tag);
3159 ret = -EBUSY;
3160 goto out;
3161 }
3162
3163 if (lock_type == CEPH_CLS_LOCK_SHARED) {
3164 rbd_warn(rbd_dev, "shared lock type detected");
3165 ret = -EBUSY;
3166 goto out;
3167 }
3168
3169 if (strncmp((*lockers)[0].id.cookie, RBD_LOCK_COOKIE_PREFIX,
3170 strlen(RBD_LOCK_COOKIE_PREFIX))) {
3171 rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
3172 (*lockers)[0].id.cookie);
3173 ret = -EBUSY;
3174 goto out;
3175 }
3176
3177out:
3178 kfree(lock_tag);
3179 return ret;
3180}
3181
3182static int find_watcher(struct rbd_device *rbd_dev,
3183 const struct ceph_locker *locker)
3184{
3185 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3186 struct ceph_watch_item *watchers;
3187 u32 num_watchers;
3188 u64 cookie;
3189 int i;
3190 int ret;
3191
3192 ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
3193 &rbd_dev->header_oloc, &watchers,
3194 &num_watchers);
3195 if (ret)
3196 return ret;
3197
3198 sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
3199 for (i = 0; i < num_watchers; i++) {
3200 if (!memcmp(&watchers[i].addr, &locker->info.addr,
3201 sizeof(locker->info.addr)) &&
3202 watchers[i].cookie == cookie) {
3203 struct rbd_client_id cid = {
3204 .gid = le64_to_cpu(watchers[i].name.num),
3205 .handle = cookie,
3206 };
3207
3208 dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
3209 rbd_dev, cid.gid, cid.handle);
3210 rbd_set_owner_cid(rbd_dev, &cid);
3211 ret = 1;
3212 goto out;
3213 }
3214 }
3215
3216 dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
3217 ret = 0;
3218out:
3219 kfree(watchers);
3220 return ret;
3221}
3222
3223/*
3224 * lock_rwsem must be held for write
3225 */
3226static int rbd_try_lock(struct rbd_device *rbd_dev)
3227{
3228 struct ceph_client *client = rbd_dev->rbd_client->client;
3229 struct ceph_locker *lockers;
3230 u32 num_lockers;
3231 int ret;
3232
3233 for (;;) {
3234 ret = rbd_lock(rbd_dev);
3235 if (ret != -EBUSY)
3236 return ret;
3237
3238 /* determine if the current lock holder is still alive */
3239 ret = get_lock_owner_info(rbd_dev, &lockers, &num_lockers);
3240 if (ret)
3241 return ret;
3242
3243 if (num_lockers == 0)
3244 goto again;
3245
3246 ret = find_watcher(rbd_dev, lockers);
3247 if (ret) {
3248 if (ret > 0)
3249 ret = 0; /* have to request lock */
3250 goto out;
3251 }
3252
3253 rbd_warn(rbd_dev, "%s%llu seems dead, breaking lock",
3254 ENTITY_NAME(lockers[0].id.name));
3255
3256 ret = ceph_monc_blacklist_add(&client->monc,
3257 &lockers[0].info.addr);
3258 if (ret) {
3259 rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d",
3260 ENTITY_NAME(lockers[0].id.name), ret);
3261 goto out;
3262 }
3263
3264 ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
3265 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3266 lockers[0].id.cookie,
3267 &lockers[0].id.name);
3268 if (ret && ret != -ENOENT)
3269 goto out;
3270
3271again:
3272 ceph_free_lockers(lockers, num_lockers);
3273 }
3274
3275out:
3276 ceph_free_lockers(lockers, num_lockers);
3277 return ret;
3278}
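
/*
 * rbd_try_lock() in short: try to take the lock; on -EBUSY, fetch the
 * current holder and look for a backing watcher.  A holder with a live
 * watcher wins (return 0, the lock must be requested via notify); a
 * holder with no watcher is presumed dead, so it is blacklisted and its
 * lock broken before retrying.  Only lockers[0] is examined -- the lock
 * is exclusive and rbd-tagged, so there is at most one holder.
 */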
3279
3280/*
3281 * ret is set only if lock_state is RBD_LOCK_STATE_UNLOCKED
3282 */
3283static enum rbd_lock_state rbd_try_acquire_lock(struct rbd_device *rbd_dev,
3284 int *pret)
3285{
3286 enum rbd_lock_state lock_state;
3287
3288 down_read(&rbd_dev->lock_rwsem);
3289 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3290 rbd_dev->lock_state);
3291 if (__rbd_is_lock_owner(rbd_dev)) {
3292 lock_state = rbd_dev->lock_state;
3293 up_read(&rbd_dev->lock_rwsem);
3294 return lock_state;
3295 }
3296
3297 up_read(&rbd_dev->lock_rwsem);
3298 down_write(&rbd_dev->lock_rwsem);
3299 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3300 rbd_dev->lock_state);
3301 if (!__rbd_is_lock_owner(rbd_dev)) {
3302 *pret = rbd_try_lock(rbd_dev);
3303 if (*pret)
3304 rbd_warn(rbd_dev, "failed to acquire lock: %d", *pret);
3305 }
3306
3307 lock_state = rbd_dev->lock_state;
3308 up_write(&rbd_dev->lock_rwsem);
3309 return lock_state;
3310}
3311
3312static void rbd_acquire_lock(struct work_struct *work)
3313{
3314 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3315 struct rbd_device, lock_dwork);
3316 enum rbd_lock_state lock_state;
Kefeng Wang37f13252017-07-13 15:46:35 +08003317 int ret = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003318
3319 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3320again:
3321 lock_state = rbd_try_acquire_lock(rbd_dev, &ret);
3322 if (lock_state != RBD_LOCK_STATE_UNLOCKED || ret == -EBLACKLISTED) {
3323 if (lock_state == RBD_LOCK_STATE_LOCKED)
3324 wake_requests(rbd_dev, true);
3325 dout("%s rbd_dev %p lock_state %d ret %d - done\n", __func__,
3326 rbd_dev, lock_state, ret);
3327 return;
3328 }
3329
3330 ret = rbd_request_lock(rbd_dev);
3331 if (ret == -ETIMEDOUT) {
3332 goto again; /* treat this as a dead client */
Ilya Dryomove010dd02017-04-13 12:17:39 +02003333 } else if (ret == -EROFS) {
3334 rbd_warn(rbd_dev, "peer will not release lock");
3335 /*
3336 * If this is rbd_add_acquire_lock(), we want to fail
3337 * immediately -- reuse BLACKLISTED flag. Otherwise we
3338 * want to block.
3339 */
3340 if (!(rbd_dev->disk->flags & GENHD_FL_UP)) {
3341 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
3342 /* wake "rbd map --exclusive" process */
3343 wake_requests(rbd_dev, false);
3344 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003345 } else if (ret < 0) {
3346 rbd_warn(rbd_dev, "error requesting lock: %d", ret);
3347 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3348 RBD_RETRY_DELAY);
3349 } else {
3350 /*
3351 * lock owner acked, but resend if we don't see them
3352 * release the lock
3353 */
3354 dout("%s rbd_dev %p requeueing lock_dwork\n", __func__,
3355 rbd_dev);
3356 mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
3357 msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
3358 }
3359}
3360
3361/*
3362 * lock_rwsem must be held for write
3363 */
3364static bool rbd_release_lock(struct rbd_device *rbd_dev)
3365{
3366 dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
3367 rbd_dev->lock_state);
3368 if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
3369 return false;
3370
3371 rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
3372 downgrade_write(&rbd_dev->lock_rwsem);
3373 /*
3374 * Ensure that all in-flight IO is flushed.
3375 *
3376 * FIXME: ceph_osdc_sync() flushes the entire OSD client, which
3377 * may be shared with other devices.
3378 */
3379 ceph_osdc_sync(&rbd_dev->rbd_client->client->osdc);
3380 up_read(&rbd_dev->lock_rwsem);
3381
3382 down_write(&rbd_dev->lock_rwsem);
3383 dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
3384 rbd_dev->lock_state);
3385 if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
3386 return false;
3387
Ilya Dryomovbbead742017-04-13 12:17:38 +02003388 rbd_unlock(rbd_dev);
3389 /*
 3390 * Give others a chance to grab the lock - otherwise we would
 3391 * re-acquire it almost immediately if new IO arrived during
 3392 * ceph_osdc_sync(). We need to ack our own notifications, so this
3393 * lock_dwork will be requeued from rbd_wait_state_locked()
3394 * after wake_requests() in rbd_handle_released_lock().
3395 */
3396 cancel_delayed_work(&rbd_dev->lock_dwork);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003397 return true;
3398}
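
/*
 * Note the rwsem dance in rbd_release_lock(): the sem is downgraded
 * (write -> read) around ceph_osdc_sync() so in-flight IO can drain
 * without the writer side being held, then re-taken for write to finish
 * the release.  The RBD_LOCK_STATE_RELEASING re-check catches a state
 * change that slipped in while the sem wasn't held for write.
 */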
3399
3400static void rbd_release_lock_work(struct work_struct *work)
3401{
3402 struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
3403 unlock_work);
3404
3405 down_write(&rbd_dev->lock_rwsem);
3406 rbd_release_lock(rbd_dev);
3407 up_write(&rbd_dev->lock_rwsem);
3408}
3409
3410static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
3411 void **p)
3412{
3413 struct rbd_client_id cid = { 0 };
3414
3415 if (struct_v >= 2) {
3416 cid.gid = ceph_decode_64(p);
3417 cid.handle = ceph_decode_64(p);
3418 }
3419
3420 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3421 cid.handle);
3422 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3423 down_write(&rbd_dev->lock_rwsem);
3424 if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3425 /*
3426 * we already know that the remote client is
3427 * the owner
3428 */
3429 up_write(&rbd_dev->lock_rwsem);
3430 return;
3431 }
3432
3433 rbd_set_owner_cid(rbd_dev, &cid);
3434 downgrade_write(&rbd_dev->lock_rwsem);
3435 } else {
3436 down_read(&rbd_dev->lock_rwsem);
3437 }
3438
3439 if (!__rbd_is_lock_owner(rbd_dev))
3440 wake_requests(rbd_dev, false);
3441 up_read(&rbd_dev->lock_rwsem);
3442}
3443
3444static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
3445 void **p)
3446{
3447 struct rbd_client_id cid = { 0 };
3448
3449 if (struct_v >= 2) {
3450 cid.gid = ceph_decode_64(p);
3451 cid.handle = ceph_decode_64(p);
3452 }
3453
3454 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3455 cid.handle);
3456 if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
3457 down_write(&rbd_dev->lock_rwsem);
3458 if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
3459 dout("%s rbd_dev %p unexpected owner, cid %llu-%llu != owner_cid %llu-%llu\n",
3460 __func__, rbd_dev, cid.gid, cid.handle,
3461 rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
3462 up_write(&rbd_dev->lock_rwsem);
3463 return;
3464 }
3465
3466 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3467 downgrade_write(&rbd_dev->lock_rwsem);
3468 } else {
3469 down_read(&rbd_dev->lock_rwsem);
3470 }
3471
3472 if (!__rbd_is_lock_owner(rbd_dev))
3473 wake_requests(rbd_dev, false);
3474 up_read(&rbd_dev->lock_rwsem);
3475}
3476
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003477/*
3478 * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
3479 * ResponseMessage is needed.
3480 */
3481static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
3482 void **p)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003483{
3484 struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
3485 struct rbd_client_id cid = { 0 };
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003486 int result = 1;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003487
3488 if (struct_v >= 2) {
3489 cid.gid = ceph_decode_64(p);
3490 cid.handle = ceph_decode_64(p);
3491 }
3492
3493 dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
3494 cid.handle);
3495 if (rbd_cid_equal(&cid, &my_cid))
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003496 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003497
3498 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003499 if (__rbd_is_lock_owner(rbd_dev)) {
3500 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
3501 rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
3502 goto out_unlock;
3503
3504 /*
3505 * encode ResponseMessage(0) so the peer can detect
3506 * a missing owner
3507 */
3508 result = 0;
3509
3510 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02003511 if (!rbd_dev->opts->exclusive) {
3512 dout("%s rbd_dev %p queueing unlock_work\n",
3513 __func__, rbd_dev);
3514 queue_work(rbd_dev->task_wq,
3515 &rbd_dev->unlock_work);
3516 } else {
3517 /* refuse to release the lock */
3518 result = -EROFS;
3519 }
Ilya Dryomoved95b212016-08-12 16:40:02 +02003520 }
3521 }
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003522
3523out_unlock:
Ilya Dryomoved95b212016-08-12 16:40:02 +02003524 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003525 return result;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003526}
3527
3528static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
3529 u64 notify_id, u64 cookie, s32 *result)
3530{
3531 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Kyle Spiers08a79102018-03-17 09:44:01 -07003532 char buf[4 + CEPH_ENCODING_START_BLK_LEN];
3533 int buf_size = sizeof(buf);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003534 int ret;
3535
3536 if (result) {
3537 void *p = buf;
3538
3539 /* encode ResponseMessage */
3540 ceph_start_encoding(&p, 1, 1,
3541 buf_size - CEPH_ENCODING_START_BLK_LEN);
3542 ceph_encode_32(&p, *result);
3543 } else {
3544 buf_size = 0;
3545 }
3546
3547 ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
3548 &rbd_dev->header_oloc, notify_id, cookie,
3549 buf, buf_size);
3550 if (ret)
3551 rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
3552}
3553
3554static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
3555 u64 cookie)
3556{
3557 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3558 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
3559}
3560
3561static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
3562 u64 notify_id, u64 cookie, s32 result)
3563{
3564 dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
3565 __rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
3566}
Ilya Dryomov922dab62016-05-26 01:15:02 +02003567
3568static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
3569 u64 notifier_id, void *data, size_t data_len)
Alex Elderb8d70032012-11-30 17:53:04 -06003570{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003571 struct rbd_device *rbd_dev = arg;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003572 void *p = data;
3573 void *const end = p + data_len;
Ilya Dryomovd4c22692016-09-06 11:15:48 +02003574 u8 struct_v = 0;
Ilya Dryomoved95b212016-08-12 16:40:02 +02003575 u32 len;
3576 u32 notify_op;
Alex Elderb8d70032012-11-30 17:53:04 -06003577 int ret;
3578
Ilya Dryomoved95b212016-08-12 16:40:02 +02003579 dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
3580 __func__, rbd_dev, cookie, notify_id, data_len);
3581 if (data_len) {
3582 ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
3583 &struct_v, &len);
3584 if (ret) {
3585 rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
3586 ret);
3587 return;
3588 }
Ilya Dryomov52bb1f92014-07-23 17:11:20 +04003589
Ilya Dryomoved95b212016-08-12 16:40:02 +02003590 notify_op = ceph_decode_32(&p);
3591 } else {
3592 /* legacy notification for header updates */
3593 notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
3594 len = 0;
3595 }
Alex Elderb8d70032012-11-30 17:53:04 -06003596
Ilya Dryomoved95b212016-08-12 16:40:02 +02003597 dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
3598 switch (notify_op) {
3599 case RBD_NOTIFY_OP_ACQUIRED_LOCK:
3600 rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
3601 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3602 break;
3603 case RBD_NOTIFY_OP_RELEASED_LOCK:
3604 rbd_handle_released_lock(rbd_dev, struct_v, &p);
3605 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3606 break;
3607 case RBD_NOTIFY_OP_REQUEST_LOCK:
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003608 ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
3609 if (ret <= 0)
Ilya Dryomoved95b212016-08-12 16:40:02 +02003610 rbd_acknowledge_notify_result(rbd_dev, notify_id,
Ilya Dryomov3b77faa2017-04-13 12:17:39 +02003611 cookie, ret);
Ilya Dryomoved95b212016-08-12 16:40:02 +02003612 else
3613 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3614 break;
3615 case RBD_NOTIFY_OP_HEADER_UPDATE:
3616 ret = rbd_dev_refresh(rbd_dev);
3617 if (ret)
3618 rbd_warn(rbd_dev, "refresh failed: %d", ret);
3619
3620 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3621 break;
3622 default:
3623 if (rbd_is_lock_owner(rbd_dev))
3624 rbd_acknowledge_notify_result(rbd_dev, notify_id,
3625 cookie, -EOPNOTSUPP);
3626 else
3627 rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
3628 break;
3629 }
Alex Elderb8d70032012-11-30 17:53:04 -06003630}
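
/*
 * rbd_watch_cb() dispatch summary: every successfully decoded notify is
 * acknowledged exactly once.  ACQUIRED_LOCK/RELEASED_LOCK update the
 * cached owner cid, REQUEST_LOCK may embed a ResponseMessage result
 * (<= 0) in the ack, HEADER_UPDATE triggers a header refresh, and an
 * unknown op is nacked with -EOPNOTSUPP only if we own the lock.
 */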
3631
Ilya Dryomov99d16942016-08-12 16:11:41 +02003632static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
3633
Ilya Dryomov922dab62016-05-26 01:15:02 +02003634static void rbd_watch_errcb(void *arg, u64 cookie, int err)
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003635{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003636 struct rbd_device *rbd_dev = arg;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003637
Ilya Dryomov922dab62016-05-26 01:15:02 +02003638 rbd_warn(rbd_dev, "encountered watch error: %d", err);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003639
Ilya Dryomoved95b212016-08-12 16:40:02 +02003640 down_write(&rbd_dev->lock_rwsem);
3641 rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
3642 up_write(&rbd_dev->lock_rwsem);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003643
Ilya Dryomov99d16942016-08-12 16:11:41 +02003644 mutex_lock(&rbd_dev->watch_mutex);
3645 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
3646 __rbd_unregister_watch(rbd_dev);
3647 rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003648
Ilya Dryomov99d16942016-08-12 16:11:41 +02003649 queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003650 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003651 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomovbb040aa2014-06-19 11:38:14 +04003652}
3653
3654/*
Ilya Dryomov99d16942016-08-12 16:11:41 +02003655 * watch_mutex must be locked
Alex Elder9969ebc2013-01-18 12:31:10 -06003656 */
Ilya Dryomov99d16942016-08-12 16:11:41 +02003657static int __rbd_register_watch(struct rbd_device *rbd_dev)
Alex Elder9969ebc2013-01-18 12:31:10 -06003658{
3659 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
Ilya Dryomov922dab62016-05-26 01:15:02 +02003660 struct ceph_osd_linger_request *handle;
Alex Elder9969ebc2013-01-18 12:31:10 -06003661
Ilya Dryomov922dab62016-05-26 01:15:02 +02003662 rbd_assert(!rbd_dev->watch_handle);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003663 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Alex Elder9969ebc2013-01-18 12:31:10 -06003664
Ilya Dryomov922dab62016-05-26 01:15:02 +02003665 handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
3666 &rbd_dev->header_oloc, rbd_watch_cb,
3667 rbd_watch_errcb, rbd_dev);
3668 if (IS_ERR(handle))
3669 return PTR_ERR(handle);
Alex Elder9969ebc2013-01-18 12:31:10 -06003670
Ilya Dryomov922dab62016-05-26 01:15:02 +02003671 rbd_dev->watch_handle = handle;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003672 return 0;
Alex Elder9969ebc2013-01-18 12:31:10 -06003673}
3674
Ilya Dryomov99d16942016-08-12 16:11:41 +02003675/*
3676 * watch_mutex must be locked
3677 */
3678static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
Ilya Dryomovfca27062013-12-16 18:02:40 +02003679{
Ilya Dryomov922dab62016-05-26 01:15:02 +02003680 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3681 int ret;
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003682
Ilya Dryomov99d16942016-08-12 16:11:41 +02003683 rbd_assert(rbd_dev->watch_handle);
3684 dout("%s rbd_dev %p\n", __func__, rbd_dev);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003685
Ilya Dryomov922dab62016-05-26 01:15:02 +02003686 ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
3687 if (ret)
3688 rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
Ilya Dryomovb30a01f2014-05-22 19:28:52 +04003689
Ilya Dryomov922dab62016-05-26 01:15:02 +02003690 rbd_dev->watch_handle = NULL;
Ilya Dryomovc525f032016-04-28 16:07:26 +02003691}
3692
Ilya Dryomov99d16942016-08-12 16:11:41 +02003693static int rbd_register_watch(struct rbd_device *rbd_dev)
Ilya Dryomovc525f032016-04-28 16:07:26 +02003694{
Ilya Dryomov99d16942016-08-12 16:11:41 +02003695 int ret;
Ilya Dryomov811c6682016-04-15 16:22:16 +02003696
Ilya Dryomov99d16942016-08-12 16:11:41 +02003697 mutex_lock(&rbd_dev->watch_mutex);
3698 rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
3699 ret = __rbd_register_watch(rbd_dev);
3700 if (ret)
3701 goto out;
3702
3703 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3704 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3705
3706out:
3707 mutex_unlock(&rbd_dev->watch_mutex);
3708 return ret;
3709}
3710
3711static void cancel_tasks_sync(struct rbd_device *rbd_dev)
3712{
3713 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3714
Ilya Dryomoved95b212016-08-12 16:40:02 +02003715 cancel_work_sync(&rbd_dev->acquired_lock_work);
3716 cancel_work_sync(&rbd_dev->released_lock_work);
3717 cancel_delayed_work_sync(&rbd_dev->lock_dwork);
3718 cancel_work_sync(&rbd_dev->unlock_work);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003719}
3720
3721static void rbd_unregister_watch(struct rbd_device *rbd_dev)
3722{
Ilya Dryomoved95b212016-08-12 16:40:02 +02003723 WARN_ON(waitqueue_active(&rbd_dev->lock_waitq));
Ilya Dryomov99d16942016-08-12 16:11:41 +02003724 cancel_tasks_sync(rbd_dev);
3725
3726 mutex_lock(&rbd_dev->watch_mutex);
3727 if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
3728 __rbd_unregister_watch(rbd_dev);
3729 rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
3730 mutex_unlock(&rbd_dev->watch_mutex);
3731
Dongsheng Yang23edca82018-06-04 06:24:37 -04003732 cancel_delayed_work_sync(&rbd_dev->watch_dwork);
Ilya Dryomov811c6682016-04-15 16:22:16 +02003733 ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
Ilya Dryomovfca27062013-12-16 18:02:40 +02003734}
3735
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003736/*
3737 * lock_rwsem must be held for write
3738 */
3739static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
3740{
3741 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3742 char cookie[32];
3743 int ret;
3744
3745 WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);
3746
3747 format_lock_cookie(rbd_dev, cookie);
3748 ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
3749 &rbd_dev->header_oloc, RBD_LOCK_NAME,
3750 CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
3751 RBD_LOCK_TAG, cookie);
3752 if (ret) {
3753 if (ret != -EOPNOTSUPP)
3754 rbd_warn(rbd_dev, "failed to update lock cookie: %d",
3755 ret);
3756
3757 /*
3758 * Lock cookie cannot be updated on older OSDs, so do
3759 * a manual release and queue an acquire.
3760 */
3761 if (rbd_release_lock(rbd_dev))
3762 queue_delayed_work(rbd_dev->task_wq,
3763 &rbd_dev->lock_dwork, 0);
3764 } else {
Florian Margaineedd8ca82017-12-13 16:43:59 +01003765 __rbd_lock(rbd_dev, cookie);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003766 }
3767}
3768
Ilya Dryomov99d16942016-08-12 16:11:41 +02003769static void rbd_reregister_watch(struct work_struct *work)
3770{
3771 struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
3772 struct rbd_device, watch_dwork);
3773 int ret;
3774
3775 dout("%s rbd_dev %p\n", __func__, rbd_dev);
3776
3777 mutex_lock(&rbd_dev->watch_mutex);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003778 if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
3779 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003780 return;
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003781 }
Ilya Dryomov99d16942016-08-12 16:11:41 +02003782
3783 ret = __rbd_register_watch(rbd_dev);
3784 if (ret) {
3785 rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
Ilya Dryomov4d736442016-09-29 14:23:12 +02003786 if (ret == -EBLACKLISTED || ret == -ENOENT) {
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003787 set_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003788 wake_requests(rbd_dev, true);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003789 } else {
Ilya Dryomov99d16942016-08-12 16:11:41 +02003790 queue_delayed_work(rbd_dev->task_wq,
3791 &rbd_dev->watch_dwork,
3792 RBD_RETRY_DELAY);
Ilya Dryomov87c0fde2016-09-29 13:41:05 +02003793 }
3794 mutex_unlock(&rbd_dev->watch_mutex);
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003795 return;
Ilya Dryomov99d16942016-08-12 16:11:41 +02003796 }
3797
3798 rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
3799 rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
3800 mutex_unlock(&rbd_dev->watch_mutex);
3801
Ilya Dryomov14bb2112017-04-13 12:17:38 +02003802 down_write(&rbd_dev->lock_rwsem);
3803 if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
3804 rbd_reacquire_lock(rbd_dev);
3805 up_write(&rbd_dev->lock_rwsem);
3806
Ilya Dryomov99d16942016-08-12 16:11:41 +02003807 ret = rbd_dev_refresh(rbd_dev);
3808 if (ret)
Colin Ian Kingf6870cc2018-03-19 13:33:10 +00003809 rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
Ilya Dryomov99d16942016-08-12 16:11:41 +02003810}
3811
Alex Elder36be9a72013-01-19 00:30:28 -06003812/*
Alex Elderf40eb342013-04-25 15:09:42 -05003813 * Synchronous osd object method call. Returns the number of bytes
3814 * returned in the outbound buffer, or a negative error code.
Alex Elder36be9a72013-01-19 00:30:28 -06003815 */
3816static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003817 struct ceph_object_id *oid,
3818 struct ceph_object_locator *oloc,
Alex Elder36be9a72013-01-19 00:30:28 -06003819 const char *method_name,
Alex Elder41579762013-04-21 12:14:45 -05003820 const void *outbound,
Alex Elder36be9a72013-01-19 00:30:28 -06003821 size_t outbound_size,
Alex Elder41579762013-04-21 12:14:45 -05003822 void *inbound,
Alex Eldere2a58ee2013-04-30 00:44:33 -05003823 size_t inbound_size)
Alex Elder36be9a72013-01-19 00:30:28 -06003824{
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003825 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
3826 struct page *req_page = NULL;
3827 struct page *reply_page;
Alex Elder36be9a72013-01-19 00:30:28 -06003828 int ret;
3829
3830 /*
Alex Elder6010a452013-04-05 01:27:11 -05003831 * Method calls are ultimately read operations. The result
 3832 * should be placed into the inbound buffer provided. They
3833 * also supply outbound data--parameters for the object
3834 * method. Currently if this is present it will be a
3835 * snapshot id.
Alex Elder36be9a72013-01-19 00:30:28 -06003836 */
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003837 if (outbound) {
3838 if (outbound_size > PAGE_SIZE)
3839 return -E2BIG;
Alex Elder36be9a72013-01-19 00:30:28 -06003840
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003841 req_page = alloc_page(GFP_KERNEL);
3842 if (!req_page)
3843 return -ENOMEM;
Alex Elder36be9a72013-01-19 00:30:28 -06003844
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003845 memcpy(page_address(req_page), outbound, outbound_size);
Alex Elder04017e22013-04-05 14:46:02 -05003846 }
Alex Elder430c28c2013-04-03 21:32:51 -05003847
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003848 reply_page = alloc_page(GFP_KERNEL);
3849 if (!reply_page) {
3850 if (req_page)
3851 __free_page(req_page);
3852 return -ENOMEM;
3853 }
Alex Elder36be9a72013-01-19 00:30:28 -06003854
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003855 ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
3856 CEPH_OSD_FLAG_READ, req_page, outbound_size,
3857 reply_page, &inbound_size);
3858 if (!ret) {
3859 memcpy(inbound, page_address(reply_page), inbound_size);
3860 ret = inbound_size;
3861 }
Alex Elder57385b52013-04-21 12:14:45 -05003862
Ilya Dryomovecd4a682017-01-25 18:16:21 +01003863 if (req_page)
3864 __free_page(req_page);
3865 __free_page(reply_page);
Alex Elder36be9a72013-01-19 00:30:28 -06003866 return ret;
3867}
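
/*
 * Hypothetical usage sketch (not taken from this file): reading an
 * 8-byte little-endian value back from a class method would look
 * roughly like this -- the method name and reply layout are
 * illustrative only:
 *
 *	__le64 reply;
 *	int ret;
 *
 *	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
 *				  &rbd_dev->header_oloc, "some_method",
 *				  NULL, 0, &reply, sizeof(reply));
 *	if (ret < 0)
 *		return ret;
 *	if (ret < sizeof(reply))
 *		return -ERANGE;
 *
 * The return value is the number of bytes placed in the inbound buffer,
 * so callers must check for short replies themselves.
 */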
3868
/*
 * lock_rwsem must be held for read
 */
static int rbd_wait_state_locked(struct rbd_device *rbd_dev, bool may_acquire)
{
	DEFINE_WAIT(wait);
	unsigned long timeout;
	int ret = 0;

	if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags))
		return -EBLACKLISTED;

	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
		return 0;

	if (!may_acquire) {
		rbd_warn(rbd_dev, "exclusive lock required");
		return -EROFS;
	}

	do {
		/*
		 * Note the use of mod_delayed_work() in rbd_acquire_lock()
		 * and cancel_delayed_work() in wake_requests().
		 */
		dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
		prepare_to_wait_exclusive(&rbd_dev->lock_waitq, &wait,
					  TASK_UNINTERRUPTIBLE);
		up_read(&rbd_dev->lock_rwsem);
		timeout = schedule_timeout(ceph_timeout_jiffies(
						rbd_dev->opts->lock_timeout));
		down_read(&rbd_dev->lock_rwsem);
		if (test_bit(RBD_DEV_FLAG_BLACKLISTED, &rbd_dev->flags)) {
			ret = -EBLACKLISTED;
			break;
		}
		if (!timeout) {
			rbd_warn(rbd_dev, "timed out waiting for lock");
			ret = -ETIMEDOUT;
			break;
		}
	} while (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED);

	finish_wait(&rbd_dev->lock_waitq, &wait);
	return ret;
}

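/*
 * Per-request worker, run from rbd_wq: translate a single block layer
 * request into an rbd image request and hand it off to the object
 * request state machine.  Write-type requests take a reference on the
 * current snapshot context, which the image request then owns.
 */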
static void rbd_queue_workfn(struct work_struct *work)
{
	struct request *rq = blk_mq_rq_from_pdu(work);
	struct rbd_device *rbd_dev = rq->q->queuedata;
	struct rbd_img_request *img_request;
	struct ceph_snap_context *snapc = NULL;
	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
	u64 length = blk_rq_bytes(rq);
	enum obj_operation_type op_type;
	u64 mapping_size;
	bool must_be_locked;
	int result;

	switch (req_op(rq)) {
	case REQ_OP_DISCARD:
		op_type = OBJ_OP_DISCARD;
		break;
	case REQ_OP_WRITE_ZEROES:
		op_type = OBJ_OP_ZEROOUT;
		break;
	case REQ_OP_WRITE:
		op_type = OBJ_OP_WRITE;
		break;
	case REQ_OP_READ:
		op_type = OBJ_OP_READ;
		break;
	default:
		dout("%s: non-fs request type %d\n", __func__, req_op(rq));
		result = -EIO;
		goto err;
	}

	/* Ignore/skip any zero-length requests */

	if (!length) {
		dout("%s: zero-length request\n", __func__);
		result = 0;
		goto err_rq;
	}

	if (op_type != OBJ_OP_READ && rbd_dev->spec->snap_id != CEPH_NOSNAP) {
		rbd_warn(rbd_dev, "%s on read-only snapshot",
			 obj_op_name(op_type));
		result = -EIO;
		goto err;
	}

	/*
	 * Quit early if the mapped snapshot no longer exists.  It's
	 * still possible the snapshot will have disappeared by the
	 * time our request arrives at the osd, but there's no sense in
	 * sending it if we already know.
	 */
	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags)) {
		dout("request for non-existent snapshot");
		rbd_assert(rbd_dev->spec->snap_id != CEPH_NOSNAP);
		result = -ENXIO;
		goto err_rq;
	}

	if (offset && length > U64_MAX - offset + 1) {
		rbd_warn(rbd_dev, "bad request range (%llu~%llu)", offset,
			 length);
		result = -EINVAL;
		goto err_rq;	/* Shouldn't happen */
	}

	blk_mq_start_request(rq);

	down_read(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;
	if (op_type != OBJ_OP_READ) {
		snapc = rbd_dev->header.snapc;
		ceph_get_snap_context(snapc);
	}
	up_read(&rbd_dev->header_rwsem);

	if (offset + length > mapping_size) {
		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
			 length, mapping_size);
		result = -EIO;
		goto err_rq;
	}

	must_be_locked =
	    (rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK) &&
	    (op_type != OBJ_OP_READ || rbd_dev->opts->lock_on_read);
	if (must_be_locked) {
		down_read(&rbd_dev->lock_rwsem);
		result = rbd_wait_state_locked(rbd_dev,
					       !rbd_dev->opts->exclusive);
		if (result)
			goto err_unlock;
	}

	img_request = rbd_img_request_create(rbd_dev, op_type, snapc);
	if (!img_request) {
		result = -ENOMEM;
		goto err_unlock;
	}
	img_request->rq = rq;
	snapc = NULL; /* img_request consumes a ref */

	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
		result = rbd_img_fill_nodata(img_request, offset, length);
	else
		result = rbd_img_fill_from_bio(img_request, offset, length,
					       rq->bio);
	if (result)
		goto err_img_request;

	rbd_img_handle_request(img_request, 0);
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
	return;

err_img_request:
	rbd_img_request_put(img_request);
err_unlock:
	if (must_be_locked)
		up_read(&rbd_dev->lock_rwsem);
err_rq:
	if (result)
		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
			 obj_op_name(op_type), length, offset, result);
	ceph_put_snap_context(snapc);
err:
	blk_mq_end_request(rq, errno_to_blk_status(result));
}

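/*
 * blk-mq ->queue_rq() hook: all real work happens in process context,
 * so just bounce the request to the rbd workqueue.
 */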
static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	queue_work(rbd_wq, work);
	return BLK_STS_OK;
}

static void rbd_free_disk(struct rbd_device *rbd_dev)
{
	blk_cleanup_queue(rbd_dev->disk->queue);
	blk_mq_free_tag_set(&rbd_dev->tag_set);
	put_disk(rbd_dev->disk);
	rbd_dev->disk = NULL;
}

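/*
 * Synchronously read up to @buf_len bytes from the start of the object
 * identified by @oid/@oloc into @buf.  Returns the number of bytes
 * read on success, or a negative errno.
 */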
static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
			     struct ceph_object_id *oid,
			     struct ceph_object_locator *oloc,
			     void *buf, int buf_len)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	struct ceph_osd_request *req;
	struct page **pages;
	int num_pages = calc_pages_for(0, buf_len);
	int ret;

	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
	if (!req)
		return -ENOMEM;

	ceph_oid_copy(&req->r_base_oid, oid);
	ceph_oloc_copy(&req->r_base_oloc, oloc);
	req->r_flags = CEPH_OSD_FLAG_READ;

	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
	if (IS_ERR(pages)) {
		ret = PTR_ERR(pages);
		goto out_req;
	}

	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
					 true);

	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
	if (ret)
		goto out_req;

	ceph_osdc_start_request(osdc, req, false);
	ret = ceph_osdc_wait_request(osdc, req);
	if (ret >= 0)
		ceph_copy_from_page_vector(pages, buf, 0, ret);

out_req:
	ceph_osdc_put_request(req);
	return ret;
}

/*
 * Read the complete header for the given rbd device.  On successful
 * return, the rbd_dev->header field will contain up-to-date
 * information about the image.
 */
static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev)
{
	struct rbd_image_header_ondisk *ondisk = NULL;
	u32 snap_count = 0;
	u64 names_size = 0;
	u32 want_count;
	int ret;

	/*
	 * The complete header will include an array of its 64-bit
	 * snapshot ids, followed by the names of those snapshots as
	 * a contiguous block of NUL-terminated strings.  Note that
	 * the number of snapshots could change by the time we read
	 * it in, in which case we re-read it.
	 */
	do {
		size_t size;

		kfree(ondisk);

		size = sizeof (*ondisk);
		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
		size += names_size;
		ondisk = kmalloc(size, GFP_KERNEL);
		if (!ondisk)
			return -ENOMEM;

		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
					&rbd_dev->header_oloc, ondisk, size);
		if (ret < 0)
			goto out;
		if ((size_t)ret < size) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
				 size, ret);
			goto out;
		}
		if (!rbd_dev_ondisk_valid(ondisk)) {
			ret = -ENXIO;
			rbd_warn(rbd_dev, "invalid header");
			goto out;
		}

		names_size = le64_to_cpu(ondisk->snap_names_len);
		want_count = snap_count;
		snap_count = le32_to_cpu(ondisk->snap_count);
	} while (snap_count != want_count);

	ret = rbd_header_from_disk(rbd_dev, ondisk);
out:
	kfree(ondisk);

	return ret;
}

/*
 * Clear the rbd device's EXISTS flag if the snapshot it's mapped to
 * has disappeared from the (just updated) snapshot context.
 */
static void rbd_exists_validate(struct rbd_device *rbd_dev)
{
	u64 snap_id;

	if (!test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags))
		return;

	snap_id = rbd_dev->spec->snap_id;
	if (snap_id == CEPH_NOSNAP)
		return;

	if (rbd_dev_snap_index(rbd_dev, snap_id) == BAD_SNAP_INDEX)
		clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
}

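/*
 * Propagate the current mapping size to the block layer so the device
 * capacity stays in sync with the image header.
 */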
static void rbd_dev_update_size(struct rbd_device *rbd_dev)
{
	sector_t size;

	/*
	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
	 * try to update its size.  If REMOVING is set, updating size
	 * is just useless work since the device can't be opened.
	 */
	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
		dout("setting size to %llu sectors", (unsigned long long)size);
		set_capacity(rbd_dev->disk, size);
		revalidate_disk(rbd_dev->disk);
	}
}

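/*
 * Re-read the image header and, if the mapping size changed as a
 * result, propagate the new size to the block layer.
 */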
static int rbd_dev_refresh(struct rbd_device *rbd_dev)
{
	u64 mapping_size;
	int ret;

	down_write(&rbd_dev->header_rwsem);
	mapping_size = rbd_dev->mapping.size;

	ret = rbd_dev_header_info(rbd_dev);
	if (ret)
		goto out;

	/*
	 * If there is a parent, see if it has disappeared due to the
	 * mapped image getting flattened.
	 */
	if (rbd_dev->parent) {
		ret = rbd_dev_v2_parent_info(rbd_dev);
		if (ret)
			goto out;
	}

	if (rbd_dev->spec->snap_id == CEPH_NOSNAP) {
		rbd_dev->mapping.size = rbd_dev->header.image_size;
	} else {
		/* validate mapped snapshot's EXISTS flag */
		rbd_exists_validate(rbd_dev);
	}

out:
	up_write(&rbd_dev->header_rwsem);
	if (!ret && mapping_size != rbd_dev->mapping.size)
		rbd_dev_update_size(rbd_dev);

	return ret;
}

static int rbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
		unsigned int hctx_idx, unsigned int numa_node)
{
	struct work_struct *work = blk_mq_rq_to_pdu(rq);

	INIT_WORK(work, rbd_queue_workfn);
	return 0;
}

static const struct blk_mq_ops rbd_mq_ops = {
	.queue_rq	= rbd_queue_rq,
	.init_request	= rbd_init_request,
};

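/*
 * Set up the gendisk, blk-mq tag set and request queue for a newly
 * mapped device.  Queue limits are derived from the image's object
 * set size and the mapping options (alloc_size, queue_depth, trim).
 */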
static int rbd_init_disk(struct rbd_device *rbd_dev)
{
	struct gendisk *disk;
	struct request_queue *q;
	unsigned int objset_bytes =
	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
	int err;

	/* create gendisk info */
	disk = alloc_disk(single_major ?
			  (1 << RBD_SINGLE_MAJOR_PART_SHIFT) :
			  RBD_MINORS_PER_MAJOR);
	if (!disk)
		return -ENOMEM;

	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
		 rbd_dev->dev_id);
	disk->major = rbd_dev->major;
	disk->first_minor = rbd_dev->minor;
	if (single_major)
		disk->flags |= GENHD_FL_EXT_DEVT;
	disk->fops = &rbd_bd_ops;
	disk->private_data = rbd_dev;

	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
	rbd_dev->tag_set.ops = &rbd_mq_ops;
	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
	rbd_dev->tag_set.nr_hw_queues = 1;
	rbd_dev->tag_set.cmd_size = sizeof(struct work_struct);

	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
	if (err)
		goto out_disk;

	q = blk_mq_init_queue(&rbd_dev->tag_set);
	if (IS_ERR(q)) {
		err = PTR_ERR(q);
		goto out_tag_set;
	}

	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */

	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
	q->limits.max_sectors = queue_max_hw_sectors(q);
	blk_queue_max_segments(q, USHRT_MAX);
	blk_queue_max_segment_size(q, UINT_MAX);
	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);

	if (rbd_dev->opts->trim) {
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
	}

	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
		q->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;

	/*
	 * disk_release() expects a queue ref from add_disk() and will
	 * put it.  Hold an extra ref until add_disk() is called.
	 */
	WARN_ON(!blk_get_queue(q));
	disk->queue = q;
	q->queuedata = rbd_dev;

	rbd_dev->disk = disk;

	return 0;
out_tag_set:
	blk_mq_free_tag_set(&rbd_dev->tag_set);
out_disk:
	put_disk(disk);
	return err;
}

/*
  sysfs
*/

static struct rbd_device *dev_to_rbd_dev(struct device *dev)
{
	return container_of(dev, struct rbd_device, dev);
}

static ssize_t rbd_size_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)rbd_dev->mapping.size);
}

/*
 * Note this shows the features for whatever's mapped, which is not
 * necessarily the base image.
 */
static ssize_t rbd_features_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "0x%016llx\n",
		       (unsigned long long)rbd_dev->mapping.features);
}

static ssize_t rbd_major_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->major)
		return sprintf(buf, "%d\n", rbd_dev->major);

	return sprintf(buf, "(none)\n");
}

static ssize_t rbd_minor_show(struct device *dev,
			      struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%d\n", rbd_dev->minor);
}

static ssize_t rbd_client_addr_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	struct ceph_entity_addr *client_addr =
	    ceph_client_addr(rbd_dev->rbd_client->client);

	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
		       le32_to_cpu(client_addr->nonce));
}

static ssize_t rbd_client_id_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "client%lld\n",
		       ceph_client_gid(rbd_dev->rbd_client->client));
}

static ssize_t rbd_cluster_fsid_show(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
}

static ssize_t rbd_config_info_show(struct device *dev,
				    struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->config_info);
}

static ssize_t rbd_pool_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
}

static ssize_t rbd_pool_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long) rbd_dev->spec->pool_id);
}

static ssize_t rbd_pool_ns_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
}

static ssize_t rbd_name_show(struct device *dev,
			     struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	if (rbd_dev->spec->image_name)
		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);

	return sprintf(buf, "(unknown)\n");
}

static ssize_t rbd_image_id_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
}

/*
 * Shows the name of the currently-mapped snapshot (or
 * RBD_SNAP_HEAD_NAME for the base image).
 */
static ssize_t rbd_snap_show(struct device *dev,
			     struct device_attribute *attr,
			     char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
}

static ssize_t rbd_snap_id_show(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);

	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
}

/*
 * For a v2 image, shows the chain of parent images, separated by empty
 * lines.  For v1 images or if there is no parent, shows "(no parent
 * image)".
 */
static ssize_t rbd_parent_show(struct device *dev,
			       struct device_attribute *attr,
			       char *buf)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	ssize_t count = 0;

	if (!rbd_dev->parent)
		return sprintf(buf, "(no parent image)\n");

	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
		struct rbd_spec *spec = rbd_dev->parent_spec;

		count += sprintf(&buf[count], "%s"
			    "pool_id %llu\npool_name %s\n"
			    "pool_ns %s\n"
			    "image_id %s\nimage_name %s\n"
			    "snap_id %llu\nsnap_name %s\n"
			    "overlap %llu\n",
			    !count ? "" : "\n", /* first? */
			    spec->pool_id, spec->pool_name,
			    spec->pool_ns ?: "",
			    spec->image_id, spec->image_name ?: "(unknown)",
			    spec->snap_id, spec->snap_name,
			    rbd_dev->parent_overlap);
	}

	return count;
}

static ssize_t rbd_image_refresh(struct device *dev,
				 struct device_attribute *attr,
				 const char *buf,
				 size_t size)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	int ret;

	ret = rbd_dev_refresh(rbd_dev);
	if (ret)
		return ret;

	return size;
}

static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);

static struct attribute *rbd_attrs[] = {
	&dev_attr_size.attr,
	&dev_attr_features.attr,
	&dev_attr_major.attr,
	&dev_attr_minor.attr,
	&dev_attr_client_addr.attr,
	&dev_attr_client_id.attr,
	&dev_attr_cluster_fsid.attr,
	&dev_attr_config_info.attr,
	&dev_attr_pool.attr,
	&dev_attr_pool_id.attr,
	&dev_attr_pool_ns.attr,
	&dev_attr_name.attr,
	&dev_attr_image_id.attr,
	&dev_attr_current_snap.attr,
	&dev_attr_snap_id.attr,
	&dev_attr_parent.attr,
	&dev_attr_refresh.attr,
	NULL
};

static struct attribute_group rbd_attr_group = {
	.attrs = rbd_attrs,
};

static const struct attribute_group *rbd_attr_groups[] = {
	&rbd_attr_group,
	NULL
};

static void rbd_dev_release(struct device *dev);

static const struct device_type rbd_device_type = {
	.name		= "rbd",
	.groups		= rbd_attr_groups,
	.release	= rbd_dev_release,
};

static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
{
	kref_get(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref);
static void rbd_spec_put(struct rbd_spec *spec)
{
	if (spec)
		kref_put(&spec->kref, rbd_spec_free);
}

static struct rbd_spec *rbd_spec_alloc(void)
{
	struct rbd_spec *spec;

	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
	if (!spec)
		return NULL;

	spec->pool_id = CEPH_NOPOOL;
	spec->snap_id = CEPH_NOSNAP;
	kref_init(&spec->kref);

	return spec;
}

static void rbd_spec_free(struct kref *kref)
{
	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);

	kfree(spec->pool_name);
	kfree(spec->pool_ns);
	kfree(spec->image_id);
	kfree(spec->image_name);
	kfree(spec->snap_name);
	kfree(spec);
}

static void rbd_dev_free(struct rbd_device *rbd_dev)
{
	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);

	ceph_oid_destroy(&rbd_dev->header_oid);
	ceph_oloc_destroy(&rbd_dev->header_oloc);
	kfree(rbd_dev->config_info);

	rbd_put_client(rbd_dev->rbd_client);
	rbd_spec_put(rbd_dev->spec);
	kfree(rbd_dev->opts);
	kfree(rbd_dev);
}

static void rbd_dev_release(struct device *dev)
{
	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
	bool need_put = !!rbd_dev->opts;

	if (need_put) {
		destroy_workqueue(rbd_dev->task_wq);
		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
	}

	rbd_dev_free(rbd_dev);

	/*
	 * This is racy, but way better than putting the module ref
	 * outside of the release callback.  The race window is pretty
	 * small, so doing something similar to dm (dm-builtin.c) is
	 * overkill.
	 */
	if (need_put)
		module_put(THIS_MODULE);
}

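/*
 * Allocate and initialize the common parts of an rbd_device.  On
 * success the new device owns the @rbdc and @spec references passed
 * in; rbd_dev_free() drops them.
 */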
static struct rbd_device *__rbd_dev_create(struct rbd_client *rbdc,
					   struct rbd_spec *spec)
{
	struct rbd_device *rbd_dev;

	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
	if (!rbd_dev)
		return NULL;

	spin_lock_init(&rbd_dev->lock);
	INIT_LIST_HEAD(&rbd_dev->node);
	init_rwsem(&rbd_dev->header_rwsem);

	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
	ceph_oid_init(&rbd_dev->header_oid);
	rbd_dev->header_oloc.pool = spec->pool_id;
	if (spec->pool_ns) {
		WARN_ON(!*spec->pool_ns);
		rbd_dev->header_oloc.pool_ns =
		    ceph_find_or_create_string(spec->pool_ns,
					       strlen(spec->pool_ns));
	}

	mutex_init(&rbd_dev->watch_mutex);
	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);

	init_rwsem(&rbd_dev->lock_rwsem);
	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
	init_waitqueue_head(&rbd_dev->lock_waitq);

	rbd_dev->dev.bus = &rbd_bus_type;
	rbd_dev->dev.type = &rbd_device_type;
	rbd_dev->dev.parent = &rbd_root_dev;
	device_initialize(&rbd_dev->dev);

	rbd_dev->rbd_client = rbdc;
	rbd_dev->spec = spec;

	return rbd_dev;
}

/*
 * Create a mapping rbd_dev.
 */
static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
					 struct rbd_spec *spec,
					 struct rbd_options *opts)
{
	struct rbd_device *rbd_dev;

	rbd_dev = __rbd_dev_create(rbdc, spec);
	if (!rbd_dev)
		return NULL;

	rbd_dev->opts = opts;

	/* get an id and fill in device name */
	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
					 minor_to_rbd_dev_id(1 << MINORBITS),
					 GFP_KERNEL);
	if (rbd_dev->dev_id < 0)
		goto fail_rbd_dev;

	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
						   rbd_dev->name);
	if (!rbd_dev->task_wq)
		goto fail_dev_id;

	/* we have a ref from do_rbd_add() */
	__module_get(THIS_MODULE);

	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
	return rbd_dev;

fail_dev_id:
	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
fail_rbd_dev:
	rbd_dev_free(rbd_dev);
	return NULL;
}

static void rbd_dev_destroy(struct rbd_device *rbd_dev)
{
	if (rbd_dev)
		put_device(&rbd_dev->dev);
}

/*
 * Get the size and object order for an image snapshot, or, if
 * snap_id is CEPH_NOSNAP, for the base image.
 */
static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
				 u8 *order, u64 *snap_size)
{
	__le64 snapid = cpu_to_le64(snap_id);
	int ret;
	struct {
		u8 order;
		__le64 size;
	} __attribute__ ((packed)) size_buf = { 0 };

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_size",
				  &snapid, sizeof(snapid),
				  &size_buf, sizeof(size_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (size_buf))
		return -ERANGE;

	if (order) {
		*order = size_buf.order;
		dout(" order %u", (unsigned int)*order);
	}
	*snap_size = le64_to_cpu(size_buf.size);

	dout(" snap_id 0x%016llx snap_size = %llu\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_size);

	return 0;
}

static int rbd_dev_v2_image_size(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
				     &rbd_dev->header.obj_order,
				     &rbd_dev->header.image_size);
}

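/*
 * Fetch the object name prefix shared by all of this image's data
 * objects and store it in the in-core header.
 */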
static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev)
{
	void *reply_buf;
	int ret;
	void *p;

	reply_buf = kzalloc(RBD_OBJ_PREFIX_LEN_MAX, GFP_KERNEL);
	if (!reply_buf)
		return -ENOMEM;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_object_prefix",
				  NULL, 0, reply_buf, RBD_OBJ_PREFIX_LEN_MAX);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		goto out;

	p = reply_buf;
	rbd_dev->header.object_prefix = ceph_extract_encoded_string(&p,
						p + ret, NULL, GFP_NOIO);
	ret = 0;

	if (IS_ERR(rbd_dev->header.object_prefix)) {
		ret = PTR_ERR(rbd_dev->header.object_prefix);
		rbd_dev->header.object_prefix = NULL;
	} else {
		dout(" object_prefix = %s\n", rbd_dev->header.object_prefix);
	}
out:
	kfree(reply_buf);

	return ret;
}

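/*
 * Query the feature bits of the given snapshot (or of the base image
 * if snap_id is CEPH_NOSNAP).  Fails with -ENXIO if the image requires
 * features this client does not support.
 */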
static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
				     u64 *snap_features)
{
	__le64 snapid = cpu_to_le64(snap_id);
	struct {
		__le64 features;
		__le64 incompat;
	} __attribute__ ((packed)) features_buf = { 0 };
	u64 unsup;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_features",
				  &snapid, sizeof(snapid),
				  &features_buf, sizeof(features_buf));
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < sizeof (features_buf))
		return -ERANGE;

	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
	if (unsup) {
		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
			 unsup);
		return -ENXIO;
	}

	*snap_features = le64_to_cpu(features_buf.features);

	dout(" snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
	     (unsigned long long)snap_id,
	     (unsigned long long)*snap_features,
	     (unsigned long long)le64_to_cpu(features_buf.incompat));

	return 0;
}

static int rbd_dev_v2_features(struct rbd_device *rbd_dev)
{
	return _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
					 &rbd_dev->header.features);
}

struct parent_image_info {
	u64		pool_id;
	const char	*pool_ns;
	const char	*image_id;
	u64		snap_id;

	bool		has_overlap;
	u64		overlap;
};

/*
 * The caller is responsible for @pii.
 */
static int decode_parent_image_spec(void **p, void *end,
				    struct parent_image_info *pii)
{
	u8 struct_v;
	u32 struct_len;
	int ret;

	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
				  &struct_v, &struct_len);
	if (ret)
		return ret;

	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->pool_ns)) {
		ret = PTR_ERR(pii->pool_ns);
		pii->pool_ns = NULL;
		return ret;
	}
	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
	return 0;

e_inval:
	return -EINVAL;
}

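/*
 * Fetch parent image info using the "parent_get" and
 * "parent_overlap_get" class methods.  Returns 1 if the OSD doesn't
 * know these methods so that the caller can fall back to the legacy
 * "get_parent" method.
 */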
static int __get_parent_info(struct rbd_device *rbd_dev,
			     struct page *req_page,
			     struct page *reply_page,
			     struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret == -EOPNOTSUPP ? 1 : ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ret = decode_parent_image_spec(&p, end, pii);
	if (ret)
		return ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
	if (pii->has_overlap)
		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

/*
 * The caller is responsible for @pii.
 */
static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
				    struct page *req_page,
				    struct page *reply_page,
				    struct parent_image_info *pii)
{
	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
	size_t reply_len = PAGE_SIZE;
	void *p, *end;
	int ret;

	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
			     req_page, sizeof(u64), reply_page, &reply_len);
	if (ret)
		return ret;

	p = page_address(reply_page);
	end = p + reply_len;
	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
	if (IS_ERR(pii->image_id)) {
		ret = PTR_ERR(pii->image_id);
		pii->image_id = NULL;
		return ret;
	}
	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
	pii->has_overlap = true;
	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);

	return 0;

e_inval:
	return -EINVAL;
}

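/*
 * Retrieve parent image info for the mapped snapshot, preferring the
 * newer class methods and falling back to "get_parent" if they are
 * not supported.
 */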
static int get_parent_info(struct rbd_device *rbd_dev,
			   struct parent_image_info *pii)
{
	struct page *req_page, *reply_page;
	void *p;
	int ret;

	req_page = alloc_page(GFP_KERNEL);
	if (!req_page)
		return -ENOMEM;

	reply_page = alloc_page(GFP_KERNEL);
	if (!reply_page) {
		__free_page(req_page);
		return -ENOMEM;
	}

	p = page_address(req_page);
	ceph_encode_64(&p, rbd_dev->spec->snap_id);
	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
	if (ret > 0)
		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
					       pii);

	__free_page(req_page);
	__free_page(reply_page);
	return ret;
}

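/*
 * Refresh the parent spec and overlap for a v2 image.  This handles
 * the clone having been flattened (the parent disappearing) as well
 * as an overlap that shrank to zero.
 */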
static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev)
{
	struct rbd_spec *parent_spec;
	struct parent_image_info pii = { 0 };
	int ret;

	parent_spec = rbd_spec_alloc();
	if (!parent_spec)
		return -ENOMEM;

	ret = get_parent_info(rbd_dev, &pii);
	if (ret)
		goto out_err;

	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
	     __func__, pii.pool_id, pii.pool_ns, pii.image_id, pii.snap_id,
	     pii.has_overlap, pii.overlap);

	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap) {
		/*
		 * Either the parent never existed, or we have a
		 * record of it but the image got flattened so it no
		 * longer has a parent.  When the parent of a
		 * layered image disappears we immediately set the
		 * overlap to 0.  The effect of this is that all new
		 * requests will be treated as if the image had no
		 * parent.
		 *
		 * If !pii.has_overlap, the parent image spec is not
		 * applicable.  It's there to avoid duplication in each
		 * snapshot record.
		 */
		if (rbd_dev->parent_overlap) {
			rbd_dev->parent_overlap = 0;
			rbd_dev_parent_put(rbd_dev);
			pr_info("%s: clone image has been flattened\n",
				rbd_dev->disk->disk_name);
		}

		goto out;	/* No parent?  No problem. */
	}

	/* The ceph file layout needs to fit pool id in 32 bits */

	ret = -EIO;
	if (pii.pool_id > (u64)U32_MAX) {
		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
			 (unsigned long long)pii.pool_id, U32_MAX);
		goto out_err;
	}

	/*
	 * The parent won't change (except when the clone is
	 * flattened, which was handled above).  So we only need to
	 * record the parent spec if we haven't already done so.
	 */
	if (!rbd_dev->parent_spec) {
		parent_spec->pool_id = pii.pool_id;
		if (pii.pool_ns && *pii.pool_ns) {
			parent_spec->pool_ns = pii.pool_ns;
			pii.pool_ns = NULL;
		}
		parent_spec->image_id = pii.image_id;
		pii.image_id = NULL;
		parent_spec->snap_id = pii.snap_id;

		rbd_dev->parent_spec = parent_spec;
		parent_spec = NULL;	/* rbd_dev now owns this */
	}

	/*
	 * We always update the parent overlap.  If it's zero we issue
	 * a warning, as we will proceed as if there was no parent.
	 */
	if (!pii.overlap) {
		if (parent_spec) {
			/* refresh, careful to warn just once */
			if (rbd_dev->parent_overlap)
				rbd_warn(rbd_dev,
				    "clone now standalone (overlap became 0)");
		} else {
			/* initial probe */
			rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
		}
	}
	rbd_dev->parent_overlap = pii.overlap;

out:
	ret = 0;
out_err:
	kfree(pii.pool_ns);
	kfree(pii.image_id);
	rbd_spec_put(parent_spec);
	return ret;
}

static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev)
{
	struct {
		__le64 stripe_unit;
		__le64 stripe_count;
	} __attribute__ ((packed)) striping_info_buf = { 0 };
	size_t size = sizeof (striping_info_buf);
	void *p;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_stripe_unit_count",
				  NULL, 0, &striping_info_buf, size);
	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
	if (ret < 0)
		return ret;
	if (ret < size)
		return -ERANGE;

	p = &striping_info_buf;
	rbd_dev->header.stripe_unit = ceph_decode_64(&p);
	rbd_dev->header.stripe_count = ceph_decode_64(&p);
	return 0;
}

static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev)
{
	__le64 data_pool_id;
	int ret;

	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
				  &rbd_dev->header_oloc, "get_data_pool",
				  NULL, 0, &data_pool_id, sizeof(data_pool_id));
	if (ret < 0)
		return ret;
	if (ret < sizeof(data_pool_id))
		return -EBADMSG;

	rbd_dev->header.data_pool_id = le64_to_cpu(data_pool_id);
	WARN_ON(rbd_dev->header.data_pool_id == CEPH_NOPOOL);
	return 0;
}

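/*
 * Look up the image name for rbd_dev's image id in the pool's
 * RBD_DIRECTORY object.  Returns a freshly allocated name on success
 * and NULL on any failure.
 */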
static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
{
	CEPH_DEFINE_OID_ONSTACK(oid);
	size_t image_id_size;
	char *image_id;
	void *p;
	void *end;
	size_t size;
	void *reply_buf = NULL;
	size_t len = 0;
	char *image_name = NULL;
	int ret;

	rbd_assert(!rbd_dev->spec->image_name);

	len = strlen(rbd_dev->spec->image_id);
	image_id_size = sizeof (__le32) + len;
	image_id = kmalloc(image_id_size, GFP_KERNEL);
	if (!image_id)
		return NULL;

	p = image_id;
	end = image_id + image_id_size;
	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);

	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
	reply_buf = kmalloc(size, GFP_KERNEL);
	if (!reply_buf)
		goto out;

	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
				  "dir_get_name", image_id, image_id_size,
				  reply_buf, size);
	if (ret < 0)
		goto out;
	p = reply_buf;
	end = reply_buf + ret;

	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
	if (IS_ERR(image_name))
		image_name = NULL;
	else
		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
out:
	kfree(reply_buf);
	kfree(image_id);

	return image_name;
}

Alex Elder2ad3d712013-04-30 00:44:33 -05005229static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5230{
5231 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5232 const char *snap_name;
5233 u32 which = 0;
5234
5235 /* Skip over names until we find the one we are looking for */
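	/*
	 * header.snap_names holds the names back to back, each
	 * NUL-terminated, in the same order as snapc->snaps; the walk
	 * below therefore advances by strlen() + 1 per name.
	 */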
5236
5237 snap_name = rbd_dev->header.snap_names;
5238 while (which < snapc->num_snaps) {
5239 if (!strcmp(name, snap_name))
5240 return snapc->snaps[which];
5241 snap_name += strlen(snap_name) + 1;
5242 which++;
5243 }
5244 return CEPH_NOSNAP;
5245}
5246
5247static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5248{
5249 struct ceph_snap_context *snapc = rbd_dev->header.snapc;
5250 u32 which;
5251 bool found = false;
5252 u64 snap_id;
5253
5254 for (which = 0; !found && which < snapc->num_snaps; which++) {
5255 const char *snap_name;
5256
5257 snap_id = snapc->snaps[which];
5258 snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
Josh Durginefadc982013-08-29 19:16:42 -07005259 if (IS_ERR(snap_name)) {
5260 /* ignore no-longer existing snapshots */
5261 if (PTR_ERR(snap_name) == -ENOENT)
5262 continue;
5263 else
5264 break;
5265 }
Alex Elder2ad3d712013-04-30 00:44:33 -05005266 found = !strcmp(name, snap_name);
5267 kfree(snap_name);
5268 }
5269 return found ? snap_id : CEPH_NOSNAP;
5270}
5271
5272/*
5273 * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
5274 * no snapshot by that name is found, or if an error occurs.
5275 */
5276static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
5277{
5278 if (rbd_dev->image_format == 1)
5279 return rbd_v1_snap_id_by_name(rbd_dev, name);
5280
5281 return rbd_v2_snap_id_by_name(rbd_dev, name);
5282}
5283
Alex Elder9e15b772012-10-30 19:40:33 -05005284/*
Ilya Dryomov04077592014-07-23 17:11:20 +04005285 * An image being mapped will have everything but the snap id.
Alex Elder9e15b772012-10-30 19:40:33 -05005286 */
Ilya Dryomov04077592014-07-23 17:11:20 +04005287static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
5288{
5289 struct rbd_spec *spec = rbd_dev->spec;
5290
5291 rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
5292 rbd_assert(spec->image_id && spec->image_name);
5293 rbd_assert(spec->snap_name);
5294
5295 if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
5296 u64 snap_id;
5297
5298 snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
5299 if (snap_id == CEPH_NOSNAP)
5300 return -ENOENT;
5301
5302 spec->snap_id = snap_id;
5303 } else {
5304 spec->snap_id = CEPH_NOSNAP;
5305 }
5306
5307 return 0;
5308}
5309
5310/*
5311 * A parent image will have all ids but none of the names.
5312 *
5313 * All names in an rbd spec are dynamically allocated. It's OK if we
5314 * can't figure out the name for an image id.
5315 */
5316static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
Alex Elder9e15b772012-10-30 19:40:33 -05005317{
Alex Elder2e9f7f12013-04-26 09:43:48 -05005318 struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
5319 struct rbd_spec *spec = rbd_dev->spec;
5320 const char *pool_name;
5321 const char *image_name;
5322 const char *snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005323 int ret;
5324
Ilya Dryomov04077592014-07-23 17:11:20 +04005325 rbd_assert(spec->pool_id != CEPH_NOPOOL);
5326 rbd_assert(spec->image_id);
5327 rbd_assert(spec->snap_id != CEPH_NOSNAP);
Alex Elder9e15b772012-10-30 19:40:33 -05005328
Alex Elder2e9f7f12013-04-26 09:43:48 -05005329 /* Get the pool name; we have to make our own copy of this */
Alex Elder9e15b772012-10-30 19:40:33 -05005330
Alex Elder2e9f7f12013-04-26 09:43:48 -05005331 pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
5332 if (!pool_name) {
5333 rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
Alex Elder935dc892012-11-01 10:17:15 -05005334 return -EIO;
5335 }
Alex Elder2e9f7f12013-04-26 09:43:48 -05005336 pool_name = kstrdup(pool_name, GFP_KERNEL);
5337 if (!pool_name)
Alex Elder9e15b772012-10-30 19:40:33 -05005338 return -ENOMEM;
5339
5340 /* Fetch the image name; tolerate failure here */
5341
Alex Elder2e9f7f12013-04-26 09:43:48 -05005342 image_name = rbd_dev_image_name(rbd_dev);
5343 if (!image_name)
Alex Elder06ecc6c2012-11-01 10:17:15 -05005344 rbd_warn(rbd_dev, "unable to get image name");
Alex Elder9e15b772012-10-30 19:40:33 -05005345
Ilya Dryomov04077592014-07-23 17:11:20 +04005346 /* Fetch the snapshot name */
Alex Elder9e15b772012-10-30 19:40:33 -05005347
Alex Elder2e9f7f12013-04-26 09:43:48 -05005348 snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
Josh Durginda6a6b62013-09-04 17:57:31 -07005349 if (IS_ERR(snap_name)) {
5350 ret = PTR_ERR(snap_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005351 goto out_err;
Alex Elder2e9f7f12013-04-26 09:43:48 -05005352 }
5353
5354 spec->pool_name = pool_name;
5355 spec->image_name = image_name;
5356 spec->snap_name = snap_name;
Alex Elder9e15b772012-10-30 19:40:33 -05005357
5358 return 0;
Ilya Dryomov04077592014-07-23 17:11:20 +04005359
Alex Elder9e15b772012-10-30 19:40:33 -05005360out_err:
Alex Elder2e9f7f12013-04-26 09:43:48 -05005361 kfree(image_name);
5362 kfree(pool_name);
Alex Elder9e15b772012-10-30 19:40:33 -05005363 return ret;
5364}
5365
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005366static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev)
Alex Elder35d489f2012-07-03 16:01:19 -05005367{
5368 size_t size;
5369 int ret;
5370 void *reply_buf;
5371 void *p;
5372 void *end;
5373 u64 seq;
5374 u32 snap_count;
5375 struct ceph_snap_context *snapc;
5376 u32 i;
5377
5378 /*
5379 * We'll need room for the seq value (maximum snapshot id),
5380 * snapshot count, and array of that many snapshot ids.
5381 * For now we have a fixed upper limit on the number we're
5382 * prepared to receive.
5383 */
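	/*
	 * A sketch of the reply layout, as implied by the decoding
	 * below (not an authoritative protocol definition):
	 *
	 *	__le64 seq;			maximum snapshot id
	 *	__le32 snap_count;
	 *	__le64 snaps[snap_count];	snapshot ids
	 */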
5384 size = sizeof (__le64) + sizeof (__le32) +
5385 RBD_MAX_SNAP_COUNT * sizeof (__le64);
5386 reply_buf = kzalloc(size, GFP_KERNEL);
5387 if (!reply_buf)
5388 return -ENOMEM;
5389
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005390 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5391 &rbd_dev->header_oloc, "get_snapcontext",
5392 NULL, 0, reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005393 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elder35d489f2012-07-03 16:01:19 -05005394 if (ret < 0)
5395 goto out;
5396
Alex Elder35d489f2012-07-03 16:01:19 -05005397 p = reply_buf;
Alex Elder57385b52013-04-21 12:14:45 -05005398 end = reply_buf + ret;
5399 ret = -ERANGE;
Alex Elder35d489f2012-07-03 16:01:19 -05005400 ceph_decode_64_safe(&p, end, seq, out);
5401 ceph_decode_32_safe(&p, end, snap_count, out);
5402
5403 /*
5404 * Make sure the reported number of snapshot ids wouldn't go
5405 * beyond the end of our buffer. But before checking that,
5406 * make sure the computed size of the snapshot context we
5407 * allocate is representable in a size_t.
5408 */
5409 if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
5410 / sizeof (u64)) {
5411 ret = -EINVAL;
5412 goto out;
5413 }
5414 if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
5415 goto out;
Alex Elder468521c2013-04-26 09:43:47 -05005416 ret = 0;
Alex Elder35d489f2012-07-03 16:01:19 -05005417
Alex Elder812164f82013-04-30 00:44:32 -05005418 snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
Alex Elder35d489f2012-07-03 16:01:19 -05005419 if (!snapc) {
5420 ret = -ENOMEM;
5421 goto out;
5422 }
Alex Elder35d489f2012-07-03 16:01:19 -05005423 snapc->seq = seq;
Alex Elder35d489f2012-07-03 16:01:19 -05005424 for (i = 0; i < snap_count; i++)
5425 snapc->snaps[i] = ceph_decode_64(&p);
5426
Alex Elder49ece552013-05-06 08:37:00 -05005427 ceph_put_snap_context(rbd_dev->header.snapc);
Alex Elder35d489f2012-07-03 16:01:19 -05005428 rbd_dev->header.snapc = snapc;
5429
5430 dout(" snap context seq = %llu, snap_count = %u\n",
Alex Elder57385b52013-04-21 12:14:45 -05005431 (unsigned long long)seq, (unsigned int)snap_count);
Alex Elder35d489f2012-07-03 16:01:19 -05005432out:
5433 kfree(reply_buf);
5434
Alex Elder57385b52013-04-21 12:14:45 -05005435 return ret;
Alex Elder35d489f2012-07-03 16:01:19 -05005436}
5437
Alex Elder54cac612013-04-30 00:44:33 -05005438static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
5439 u64 snap_id)
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005440{
5441 size_t size;
5442 void *reply_buf;
Alex Elder54cac612013-04-30 00:44:33 -05005443 __le64 snapid;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005444 int ret;
5445 void *p;
5446 void *end;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005447 char *snap_name;
5448
5449 size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
5450 reply_buf = kmalloc(size, GFP_KERNEL);
5451 if (!reply_buf)
5452 return ERR_PTR(-ENOMEM);
5453
Alex Elder54cac612013-04-30 00:44:33 -05005454 snapid = cpu_to_le64(snap_id);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005455 ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
5456 &rbd_dev->header_oloc, "get_snapshot_name",
5457 &snapid, sizeof(snapid), reply_buf, size);
Alex Elder36be9a72013-01-19 00:30:28 -06005458 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderf40eb342013-04-25 15:09:42 -05005459 if (ret < 0) {
5460 snap_name = ERR_PTR(ret);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005461 goto out;
Alex Elderf40eb342013-04-25 15:09:42 -05005462 }
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005463
5464 p = reply_buf;
Alex Elderf40eb342013-04-25 15:09:42 -05005465 end = reply_buf + ret;
Alex Eldere5c35532012-10-25 23:34:41 -05005466 snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
Alex Elderf40eb342013-04-25 15:09:42 -05005467 if (IS_ERR(snap_name))
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005468 goto out;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005469
Alex Elderf40eb342013-04-25 15:09:42 -05005470 dout(" snap_id 0x%016llx snap_name = %s\n",
Alex Elder54cac612013-04-30 00:44:33 -05005471 (unsigned long long)snap_id, snap_name);
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005472out:
5473 kfree(reply_buf);
5474
Alex Elderf40eb342013-04-25 15:09:42 -05005475 return snap_name;
Alex Elderb8b1e2d2012-07-03 16:01:19 -05005476}
5477
Alex Elder2df3fac2013-05-06 09:51:30 -05005478static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev)
Alex Elder117973f2012-08-31 17:29:55 -05005479{
Alex Elder2df3fac2013-05-06 09:51:30 -05005480 bool first_time = rbd_dev->header.object_prefix == NULL;
Alex Elder117973f2012-08-31 17:29:55 -05005481 int ret;
Alex Elder117973f2012-08-31 17:29:55 -05005482
Josh Durgin1617e402013-06-12 14:43:10 -07005483 ret = rbd_dev_v2_image_size(rbd_dev);
5484 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005485 return ret;
Josh Durgin1617e402013-06-12 14:43:10 -07005486
Alex Elder2df3fac2013-05-06 09:51:30 -05005487 if (first_time) {
5488 ret = rbd_dev_v2_header_onetime(rbd_dev);
5489 if (ret)
Alex Eldercfbf6372013-05-31 17:40:45 -05005490 return ret;
Alex Elder2df3fac2013-05-06 09:51:30 -05005491 }
5492
Alex Eldercc4a38bd2013-04-30 00:44:33 -05005493 ret = rbd_dev_v2_snap_context(rbd_dev);
Ilya Dryomovd194cd12015-08-31 18:22:10 +03005494 if (ret && first_time) {
5495 kfree(rbd_dev->header.object_prefix);
5496 rbd_dev->header.object_prefix = NULL;
5497 }
Alex Elder117973f2012-08-31 17:29:55 -05005498
5499 return ret;
5500}
5501
Ilya Dryomova720ae02014-07-23 17:11:19 +04005502static int rbd_dev_header_info(struct rbd_device *rbd_dev)
5503{
5504 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
5505
5506 if (rbd_dev->image_format == 1)
5507 return rbd_dev_v1_header_info(rbd_dev);
5508
5509 return rbd_dev_v2_header_info(rbd_dev);
5510}
5511
Alex Elder1ddbe942012-01-29 13:57:44 -06005512/*
Alex Eldere28fff262012-02-02 08:13:30 -06005513 * Skips over white space at *buf, and updates *buf to point to the
5514 * first found non-space character (if any). Returns the length of
Alex Elder593a9e72012-02-07 12:03:37 -06005515 * the token (string of non-white space characters) found. Note
5516 * that *buf must be terminated with '\0'.
Alex Eldere28fff262012-02-02 08:13:30 -06005517 */
5518static inline size_t next_token(const char **buf)
5519{
5520 /*
5521 * These are the characters that produce nonzero for
5522 * isspace() in the "C" and "POSIX" locales.
5523 */
5524 const char *spaces = " \f\n\r\t\v";
5525
5526 *buf += strspn(*buf, spaces); /* Find start of token */
5527
5528 return strcspn(*buf, spaces); /* Return token length */
5529}
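/*
 * For example (hypothetical input): with *buf pointing at "  rbd image",
 * next_token() advances *buf to "rbd image" and returns 3, the length
 * of the "rbd" token.
 */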
5530
5531/*
Alex Elderea3352f2012-07-09 21:04:23 -05005532 * Finds the next token in *buf, dynamically allocates a buffer big
5533 * enough to hold a copy of it, and copies the token into the new
5534 * buffer. The copy is guaranteed to be terminated with '\0'. Note
5535 * that a duplicate buffer is created even for a zero-length token.
5536 *
5537 * Returns a pointer to the newly-allocated duplicate, or a null
5538 * pointer if memory for the duplicate was not available. If
5539 * the lenp argument is a non-null pointer, the length of the token
5540 * (not including the '\0') is returned in *lenp.
5541 *
5542 * If successful, the *buf pointer will be updated to point beyond
5543 * the end of the found token.
5544 *
5545 * Note: uses GFP_KERNEL for allocation.
5546 */
5547static inline char *dup_token(const char **buf, size_t *lenp)
5548{
5549 char *dup;
5550 size_t len;
5551
5552 len = next_token(buf);
Alex Elder4caf35f2012-11-01 08:39:27 -05005553 dup = kmemdup(*buf, len + 1, GFP_KERNEL);
Alex Elderea3352f2012-07-09 21:04:23 -05005554 if (!dup)
5555 return NULL;
Alex Elderea3352f2012-07-09 21:04:23 -05005556 *(dup + len) = '\0';
5557 *buf += len;
5558
5559 if (lenp)
5560 *lenp = len;
5561
5562 return dup;
5563}
5564
5565/*
Alex Elder859c31d2012-10-25 23:34:42 -05005566 * Parse the options provided for an "rbd add" (i.e., rbd image
5567 * mapping) request. These arrive via a write to /sys/bus/rbd/add,
5568 * and the data written is passed here via a NUL-terminated buffer.
5569 * Returns 0 if successful or an error code otherwise.
Alex Elderd22f76e2012-07-12 10:46:35 -05005570 *
Alex Elder859c31d2012-10-25 23:34:42 -05005571 * The information extracted from these options is recorded in
5572 * the other parameters which return dynamically-allocated
5573 * structures:
5574 * ceph_opts
5575 * The address of a pointer that will refer to a ceph options
5576 * structure. Caller must release the returned pointer using
5577 * ceph_destroy_options() when it is no longer needed.
5578 * rbd_opts
5579 * Address of an rbd options pointer. Fully initialized by
5580 * this function; caller must release with kfree().
5581 * spec
5582 * Address of an rbd image specification pointer. Fully
5583 * initialized by this function based on parsed options.
5584 * Caller must release with rbd_spec_put().
5585 *
5586 * The options passed take this form:
 5587 * <mon_addrs> <options> <pool_name> <image_name> [<snap_name>]
5588 * where:
5589 * <mon_addrs>
5590 * A comma-separated list of one or more monitor addresses.
5591 * A monitor address is an ip address, optionally followed
5592 * by a port number (separated by a colon).
5593 * I.e.: ip1[:port1][,ip2[:port2]...]
5594 * <options>
5595 * A comma-separated list of ceph and/or rbd options.
5596 * <pool_name>
5597 * The name of the rados pool containing the rbd image.
5598 * <image_name>
5599 * The name of the image in that pool to map.
 5600 * <snap_name>
 5601 * An optional snapshot name. If provided, the mapping will
 5602 * present data from the image at the time that snapshot was
 5603 * created. The image head is used if no snapshot name is
 5604 * provided. Snapshot mappings are always read-only.
Alex Eldera725f65e2012-02-02 08:13:30 -06005605 */
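/*
 * For example (hypothetical monitor address and credentials), writing
 *
 *	1.2.3.4:6789 name=admin,secret=<key> rbd myimage mysnap
 *
 * to /sys/bus/rbd/add would map snapshot "mysnap" of image "myimage"
 * in pool "rbd"; passing "-" (or nothing) in place of "mysnap" maps
 * the image head.
 */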
Alex Elder859c31d2012-10-25 23:34:42 -05005606static int rbd_add_parse_args(const char *buf,
Alex Elderdc79b112012-10-25 23:34:41 -05005607 struct ceph_options **ceph_opts,
Alex Elder859c31d2012-10-25 23:34:42 -05005608 struct rbd_options **opts,
5609 struct rbd_spec **rbd_spec)
Alex Eldera725f65e2012-02-02 08:13:30 -06005610{
Alex Elderd22f76e2012-07-12 10:46:35 -05005611 size_t len;
Alex Elder859c31d2012-10-25 23:34:42 -05005612 char *options;
Alex Elder0ddebc02012-10-25 23:34:41 -05005613 const char *mon_addrs;
Alex Elderecb4dc222013-04-26 09:43:47 -05005614 char *snap_name;
Alex Elder0ddebc02012-10-25 23:34:41 -05005615 size_t mon_addrs_size;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005616 struct parse_rbd_opts_ctx pctx = { 0 };
Alex Elder859c31d2012-10-25 23:34:42 -05005617 struct ceph_options *copts;
Alex Elderdc79b112012-10-25 23:34:41 -05005618 int ret;
Alex Eldere28fff262012-02-02 08:13:30 -06005619
5620 /* The first four tokens are required */
5621
Alex Elder7ef32142012-02-02 08:13:30 -06005622 len = next_token(&buf);
Alex Elder4fb5d6712012-11-01 10:17:15 -05005623 if (!len) {
5624 rbd_warn(NULL, "no monitor address(es) provided");
5625 return -EINVAL;
5626 }
Alex Elder0ddebc02012-10-25 23:34:41 -05005627 mon_addrs = buf;
Alex Elderf28e5652012-10-25 23:34:41 -05005628 mon_addrs_size = len + 1;
Alex Elder7ef32142012-02-02 08:13:30 -06005629 buf += len;
Alex Eldera725f65e2012-02-02 08:13:30 -06005630
Alex Elderdc79b112012-10-25 23:34:41 -05005631 ret = -EINVAL;
Alex Elderf28e5652012-10-25 23:34:41 -05005632 options = dup_token(&buf, NULL);
5633 if (!options)
Alex Elderdc79b112012-10-25 23:34:41 -05005634 return -ENOMEM;
Alex Elder4fb5d6712012-11-01 10:17:15 -05005635 if (!*options) {
5636 rbd_warn(NULL, "no options provided");
5637 goto out_err;
5638 }
Alex Eldera725f65e2012-02-02 08:13:30 -06005639
Ilya Dryomovc3001562018-07-03 15:28:43 +02005640 pctx.spec = rbd_spec_alloc();
5641 if (!pctx.spec)
Alex Elderf28e5652012-10-25 23:34:41 -05005642 goto out_mem;
Alex Elder859c31d2012-10-25 23:34:42 -05005643
Ilya Dryomovc3001562018-07-03 15:28:43 +02005644 pctx.spec->pool_name = dup_token(&buf, NULL);
5645 if (!pctx.spec->pool_name)
Alex Elder859c31d2012-10-25 23:34:42 -05005646 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005647 if (!*pctx.spec->pool_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005648 rbd_warn(NULL, "no pool name provided");
5649 goto out_err;
5650 }
Alex Eldere28fff262012-02-02 08:13:30 -06005651
Ilya Dryomovc3001562018-07-03 15:28:43 +02005652 pctx.spec->image_name = dup_token(&buf, NULL);
5653 if (!pctx.spec->image_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005654 goto out_mem;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005655 if (!*pctx.spec->image_name) {
Alex Elder4fb5d6712012-11-01 10:17:15 -05005656 rbd_warn(NULL, "no image name provided");
5657 goto out_err;
5658 }
Alex Eldere28fff262012-02-02 08:13:30 -06005659
Alex Elderf28e5652012-10-25 23:34:41 -05005660 /*
5661 * Snapshot name is optional; default is to use "-"
5662 * (indicating the head/no snapshot).
5663 */
Alex Elder3feeb8942012-08-31 17:29:52 -05005664 len = next_token(&buf);
Alex Elder820a5f32012-07-09 21:04:24 -05005665 if (!len) {
Alex Elder3feeb8942012-08-31 17:29:52 -05005666 buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
5667 len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
Alex Elderf28e5652012-10-25 23:34:41 -05005668 } else if (len > RBD_MAX_SNAP_NAME_LEN) {
Alex Elderdc79b112012-10-25 23:34:41 -05005669 ret = -ENAMETOOLONG;
Alex Elderf28e5652012-10-25 23:34:41 -05005670 goto out_err;
Alex Elder849b4262012-07-09 21:04:24 -05005671 }
Alex Elderecb4dc222013-04-26 09:43:47 -05005672 snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
5673 if (!snap_name)
Alex Elderf28e5652012-10-25 23:34:41 -05005674 goto out_mem;
Alex Elderecb4dc222013-04-26 09:43:47 -05005675 *(snap_name + len) = '\0';
Ilya Dryomovc3001562018-07-03 15:28:43 +02005676 pctx.spec->snap_name = snap_name;
Alex Eldere5c35532012-10-25 23:34:41 -05005677
Alex Elder0ddebc02012-10-25 23:34:41 -05005678 /* Initialize all rbd options to the defaults */
Alex Eldere28fff262012-02-02 08:13:30 -06005679
Ilya Dryomovc3001562018-07-03 15:28:43 +02005680 pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
5681 if (!pctx.opts)
Alex Elder4e9afeb2012-10-25 23:34:41 -05005682 goto out_mem;
5683
Ilya Dryomovc3001562018-07-03 15:28:43 +02005684 pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
5685 pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01005686 pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005687 pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
5688 pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
5689 pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
5690 pctx.opts->trim = RBD_TRIM_DEFAULT;
Alex Elderd22f76e2012-07-12 10:46:35 -05005691
Alex Elder859c31d2012-10-25 23:34:42 -05005692 copts = ceph_parse_options(options, mon_addrs,
Ilya Dryomovc3001562018-07-03 15:28:43 +02005693 mon_addrs + mon_addrs_size - 1,
5694 parse_rbd_opts_token, &pctx);
Alex Elder859c31d2012-10-25 23:34:42 -05005695 if (IS_ERR(copts)) {
5696 ret = PTR_ERR(copts);
Alex Elderdc79b112012-10-25 23:34:41 -05005697 goto out_err;
5698 }
Alex Elder859c31d2012-10-25 23:34:42 -05005699 kfree(options);
5700
5701 *ceph_opts = copts;
Ilya Dryomovc3001562018-07-03 15:28:43 +02005702 *opts = pctx.opts;
5703 *rbd_spec = pctx.spec;
Alex Elder0ddebc02012-10-25 23:34:41 -05005704
Alex Elderdc79b112012-10-25 23:34:41 -05005705 return 0;
Alex Elderf28e5652012-10-25 23:34:41 -05005706out_mem:
Alex Elderdc79b112012-10-25 23:34:41 -05005707 ret = -ENOMEM;
Alex Elderd22f76e2012-07-12 10:46:35 -05005708out_err:
Ilya Dryomovc3001562018-07-03 15:28:43 +02005709 kfree(pctx.opts);
5710 rbd_spec_put(pctx.spec);
Alex Elderf28e5652012-10-25 23:34:41 -05005711 kfree(options);
Alex Elderd22f76e2012-07-12 10:46:35 -05005712
Alex Elderdc79b112012-10-25 23:34:41 -05005713 return ret;
Alex Eldera725f65e2012-02-02 08:13:30 -06005714}
5715
Ilya Dryomove010dd02017-04-13 12:17:39 +02005716static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
5717{
5718 down_write(&rbd_dev->lock_rwsem);
5719 if (__rbd_is_lock_owner(rbd_dev))
5720 rbd_unlock(rbd_dev);
5721 up_write(&rbd_dev->lock_rwsem);
5722}
5723
5724static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
5725{
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005726 int ret;
5727
Ilya Dryomove010dd02017-04-13 12:17:39 +02005728 if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
5729 rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
5730 return -EINVAL;
5731 }
5732
 5733	/* FIXME: "rbd map --exclusive" should be interruptible */
5734 down_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005735 ret = rbd_wait_state_locked(rbd_dev, true);
Ilya Dryomove010dd02017-04-13 12:17:39 +02005736 up_read(&rbd_dev->lock_rwsem);
Ilya Dryomov2f18d462018-04-04 10:15:38 +02005737 if (ret) {
Ilya Dryomove010dd02017-04-13 12:17:39 +02005738 rbd_warn(rbd_dev, "failed to acquire exclusive lock");
5739 return -EROFS;
5740 }
5741
5742 return 0;
5743}
5744
Ilya Dryomov30ba1f02014-05-13 11:19:27 +04005745/*
Alex Elder589d30e2012-07-10 20:30:11 -05005746 * An rbd format 2 image has a unique identifier, distinct from the
5747 * name given to it by the user. Internally, that identifier is
5748 * what's used to specify the names of objects related to the image.
5749 *
5750 * A special "rbd id" object is used to map an rbd image name to its
5751 * id. If that object doesn't exist, then there is no v2 rbd image
5752 * with the supplied name.
5753 *
5754 * This function will record the given rbd_dev's image_id field if
5755 * it can be determined, and in that case will return 0. If any
5756 * errors occur a negative errno will be returned and the rbd_dev's
5757 * image_id field will be unchanged (and should be NULL).
5758 */
5759static int rbd_dev_image_id(struct rbd_device *rbd_dev)
5760{
5761 int ret;
5762 size_t size;
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005763 CEPH_DEFINE_OID_ONSTACK(oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005764 void *response;
Alex Elderc0fba362013-04-25 23:15:08 -05005765 char *image_id;
Alex Elder2f82ee52012-10-30 19:40:33 -05005766
Alex Elder589d30e2012-07-10 20:30:11 -05005767 /*
Alex Elder2c0d0a12012-10-30 19:40:33 -05005768 * When probing a parent image, the image id is already
5769 * known (and the image name likely is not). There's no
Alex Elderc0fba362013-04-25 23:15:08 -05005770 * need to fetch the image id again in this case. We
5771 * do still need to set the image format though.
Alex Elder2c0d0a12012-10-30 19:40:33 -05005772 */
Alex Elderc0fba362013-04-25 23:15:08 -05005773 if (rbd_dev->spec->image_id) {
5774 rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
5775
Alex Elder2c0d0a12012-10-30 19:40:33 -05005776 return 0;
Alex Elderc0fba362013-04-25 23:15:08 -05005777 }
Alex Elder2c0d0a12012-10-30 19:40:33 -05005778
5779 /*
Alex Elder589d30e2012-07-10 20:30:11 -05005780 * First, see if the format 2 image id file exists, and if
5781 * so, get the image's persistent id from it.
5782 */
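	/*
	 * The id object's name is the image name with RBD_ID_PREFIX
	 * prepended (e.g. "rbd_id.myimage" for an image named
	 * "myimage", assuming the usual "rbd_id." prefix value).
	 */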
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005783 ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
5784 rbd_dev->spec->image_name);
5785 if (ret)
5786 return ret;
5787
5788 dout("rbd id object name is %s\n", oid.name);
Alex Elder589d30e2012-07-10 20:30:11 -05005789
5790 /* Response will be an encoded string, which includes a length */
5791
5792 size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
5793 response = kzalloc(size, GFP_NOIO);
5794 if (!response) {
5795 ret = -ENOMEM;
5796 goto out;
5797 }
5798
Alex Elderc0fba362013-04-25 23:15:08 -05005799 /* If it doesn't exist we'll assume it's a format 1 image */
5800
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005801 ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
5802 "get_id", NULL, 0,
5803 response, RBD_IMAGE_ID_LEN_MAX);
Alex Elder36be9a72013-01-19 00:30:28 -06005804 dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
Alex Elderc0fba362013-04-25 23:15:08 -05005805 if (ret == -ENOENT) {
5806 image_id = kstrdup("", GFP_KERNEL);
5807 ret = image_id ? 0 : -ENOMEM;
5808 if (!ret)
5809 rbd_dev->image_format = 1;
Ilya Dryomov7dd440c2014-09-11 18:49:18 +04005810 } else if (ret >= 0) {
Alex Elderc0fba362013-04-25 23:15:08 -05005811 void *p = response;
Alex Elder589d30e2012-07-10 20:30:11 -05005812
Alex Elderc0fba362013-04-25 23:15:08 -05005813 image_id = ceph_extract_encoded_string(&p, p + ret,
Alex Elder979ed482012-11-01 08:39:26 -05005814 NULL, GFP_NOIO);
Duan Jiong461f7582014-04-11 16:38:12 +08005815 ret = PTR_ERR_OR_ZERO(image_id);
Alex Elderc0fba362013-04-25 23:15:08 -05005816 if (!ret)
5817 rbd_dev->image_format = 2;
Alex Elderc0fba362013-04-25 23:15:08 -05005818 }
5819
5820 if (!ret) {
5821 rbd_dev->spec->image_id = image_id;
5822 dout("image_id is %s\n", image_id);
Alex Elder589d30e2012-07-10 20:30:11 -05005823 }
5824out:
5825 kfree(response);
Ilya Dryomovecd4a682017-01-25 18:16:21 +01005826 ceph_oid_destroy(&oid);
Alex Elder589d30e2012-07-10 20:30:11 -05005827 return ret;
5828}
5829
Alex Elder3abef3b2013-05-13 20:35:37 -05005830/*
 5831 * Undo whatever state changes are made by a v1 or v2 header info
 5832 * call.
5833 */
Alex Elder6fd48b32013-04-28 23:32:34 -05005834static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
5835{
5836 struct rbd_image_header *header;
5837
Ilya Dryomove69b8d42015-01-19 12:06:14 +03005838 rbd_dev_parent_put(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05005839
5840 /* Free dynamic fields from the header, then zero it out */
5841
5842 header = &rbd_dev->header;
Alex Elder812164f82013-04-30 00:44:32 -05005843 ceph_put_snap_context(header->snapc);
Alex Elder6fd48b32013-04-28 23:32:34 -05005844 kfree(header->snap_sizes);
5845 kfree(header->snap_names);
5846 kfree(header->object_prefix);
5847 memset(header, 0, sizeof (*header));
5848}
5849
Alex Elder2df3fac2013-05-06 09:51:30 -05005850static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev)
Alex Eldera30b71b2012-07-10 20:30:11 -05005851{
5852 int ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005853
Alex Elder1e130192012-07-03 16:01:19 -05005854 ret = rbd_dev_v2_object_prefix(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005855 if (ret)
Alex Elder1e130192012-07-03 16:01:19 -05005856 goto out_err;
Alex Elderb1b54022012-07-03 16:01:19 -05005857
Alex Elder2df3fac2013-05-06 09:51:30 -05005858 /*
 5859	 * Get and check the features for the image. Currently the
5860 * features are assumed to never change.
5861 */
Alex Elderb1b54022012-07-03 16:01:19 -05005862 ret = rbd_dev_v2_features(rbd_dev);
Alex Elder57385b52013-04-21 12:14:45 -05005863 if (ret)
Alex Elderb1b54022012-07-03 16:01:19 -05005864 goto out_err;
Alex Elder35d489f2012-07-03 16:01:19 -05005865
Alex Eldercc070d52013-04-21 12:14:45 -05005866 /* If the image supports fancy striping, get its parameters */
5867
5868 if (rbd_dev->header.features & RBD_FEATURE_STRIPINGV2) {
5869 ret = rbd_dev_v2_striping_info(rbd_dev);
5870 if (ret < 0)
5871 goto out_err;
5872 }
Alex Eldera30b71b2012-07-10 20:30:11 -05005873
Ilya Dryomov7e973322017-01-25 18:16:22 +01005874 if (rbd_dev->header.features & RBD_FEATURE_DATA_POOL) {
5875 ret = rbd_dev_v2_data_pool(rbd_dev);
5876 if (ret)
5877 goto out_err;
5878 }
5879
Ilya Dryomov263423f2017-01-25 18:16:22 +01005880 rbd_init_layout(rbd_dev);
Alex Elder35152972012-08-31 17:29:55 -05005881 return 0;
Ilya Dryomov263423f2017-01-25 18:16:22 +01005882
Alex Elder9d475de2012-07-03 16:01:19 -05005883out_err:
Alex Elder642a2532013-05-06 17:40:33 -05005884 rbd_dev->header.features = 0;
Alex Elder1e130192012-07-03 16:01:19 -05005885 kfree(rbd_dev->header.object_prefix);
5886 rbd_dev->header.object_prefix = NULL;
Alex Elder9d475de2012-07-03 16:01:19 -05005887 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05005888}
5889
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005890/*
5891 * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
5892 * rbd_dev_image_probe() recursion depth, which means it's also the
5893 * length of the already discovered part of the parent chain.
5894 */
5895static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
Alex Elder83a06262012-10-30 15:47:17 -05005896{
Alex Elder2f82ee52012-10-30 19:40:33 -05005897 struct rbd_device *parent = NULL;
Alex Elder124afba2013-04-26 15:44:36 -05005898 int ret;
5899
5900 if (!rbd_dev->parent_spec)
5901 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005902
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005903 if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
5904 pr_info("parent chain is too long (%d)\n", depth);
5905 ret = -EINVAL;
5906 goto out_err;
5907 }
5908
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005909 parent = __rbd_dev_create(rbd_dev->rbd_client, rbd_dev->parent_spec);
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005910 if (!parent) {
5911 ret = -ENOMEM;
Alex Elder124afba2013-04-26 15:44:36 -05005912 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005913 }
5914
5915 /*
5916 * Images related by parent/child relationships always share
5917 * rbd_client and spec/parent_spec, so bump their refcounts.
5918 */
5919 __rbd_get_client(rbd_dev->rbd_client);
5920 rbd_spec_get(rbd_dev->parent_spec);
Alex Elder124afba2013-04-26 15:44:36 -05005921
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02005922 ret = rbd_dev_image_probe(parent, depth);
Alex Elder124afba2013-04-26 15:44:36 -05005923 if (ret < 0)
5924 goto out_err;
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005925
Alex Elder124afba2013-04-26 15:44:36 -05005926 rbd_dev->parent = parent;
Alex Eldera2acd002013-05-08 22:50:04 -05005927 atomic_set(&rbd_dev->parent_ref, 1);
Alex Elder124afba2013-04-26 15:44:36 -05005928 return 0;
Alex Elder124afba2013-04-26 15:44:36 -05005929
Ilya Dryomov1f2c6652015-10-11 19:38:00 +02005930out_err:
5931 rbd_dev_unparent(rbd_dev);
Markus Elfring1761b222015-11-23 20:16:45 +01005932 rbd_dev_destroy(parent);
Alex Elder124afba2013-04-26 15:44:36 -05005933 return ret;
5934}
5935
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005936static void rbd_dev_device_release(struct rbd_device *rbd_dev)
5937{
5938 clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
5939 rbd_dev_mapping_clear(rbd_dev);
5940 rbd_free_disk(rbd_dev);
5941 if (!single_major)
5942 unregister_blkdev(rbd_dev->major, rbd_dev->name);
5943}
5944
Ilya Dryomov811c6682016-04-15 16:22:16 +02005945/*
5946 * rbd_dev->header_rwsem must be locked for write and will be unlocked
5947 * upon return.
5948 */
Alex Elder200a6a82013-04-28 23:32:34 -05005949static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
Alex Elder124afba2013-04-26 15:44:36 -05005950{
Alex Elder83a06262012-10-30 15:47:17 -05005951 int ret;
Alex Elder83a06262012-10-30 15:47:17 -05005952
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005953 /* Record our major and minor device numbers. */
Alex Elder83a06262012-10-30 15:47:17 -05005954
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005955 if (!single_major) {
5956 ret = register_blkdev(0, rbd_dev->name);
5957 if (ret < 0)
Ilya Dryomov1643dfa2016-08-12 15:45:52 +02005958 goto err_out_unlock;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005959
5960 rbd_dev->major = ret;
5961 rbd_dev->minor = 0;
5962 } else {
5963 rbd_dev->major = rbd_major;
5964 rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
5965 }
Alex Elder83a06262012-10-30 15:47:17 -05005966
5967 /* Set up the blkdev mapping. */
5968
5969 ret = rbd_init_disk(rbd_dev);
5970 if (ret)
5971 goto err_out_blkdev;
5972
Alex Elderf35a4de2013-05-06 09:51:29 -05005973 ret = rbd_dev_mapping_set(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005974 if (ret)
5975 goto err_out_disk;
Ilya Dryomovbc1ecc62014-08-04 18:04:39 +04005976
Alex Elderf35a4de2013-05-06 09:51:29 -05005977 set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
Ilya Dryomov9568c932017-10-12 12:35:19 +02005978 set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only);
Alex Elderf35a4de2013-05-06 09:51:29 -05005979
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005980 ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
Alex Elderf35a4de2013-05-06 09:51:29 -05005981 if (ret)
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04005982 goto err_out_mapping;
Alex Elder83a06262012-10-30 15:47:17 -05005983
Alex Elder129b79d2013-04-26 15:44:36 -05005984 set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005985 up_write(&rbd_dev->header_rwsem);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02005986 return 0;
Alex Elder2f82ee52012-10-30 19:40:33 -05005987
Alex Elderf35a4de2013-05-06 09:51:29 -05005988err_out_mapping:
5989 rbd_dev_mapping_clear(rbd_dev);
Alex Elder83a06262012-10-30 15:47:17 -05005990err_out_disk:
5991 rbd_free_disk(rbd_dev);
5992err_out_blkdev:
Ilya Dryomov9b60e702013-12-13 15:28:57 +02005993 if (!single_major)
5994 unregister_blkdev(rbd_dev->major, rbd_dev->name);
Ilya Dryomov811c6682016-04-15 16:22:16 +02005995err_out_unlock:
5996 up_write(&rbd_dev->header_rwsem);
Alex Elder83a06262012-10-30 15:47:17 -05005997 return ret;
5998}
5999
Alex Elder332bb122013-04-27 09:59:30 -05006000static int rbd_dev_header_name(struct rbd_device *rbd_dev)
6001{
6002 struct rbd_spec *spec = rbd_dev->spec;
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006003 int ret;
Alex Elder332bb122013-04-27 09:59:30 -05006004
6005 /* Record the header object name for this rbd image. */
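	/*
	 * For a format 1 image this yields "<image_name>.rbd"; for
	 * format 2 it yields "rbd_header.<image_id>" (assuming the
	 * usual RBD_SUFFIX and RBD_HEADER_PREFIX values).
	 */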
6006
6007 rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
Alex Elder332bb122013-04-27 09:59:30 -05006008 if (rbd_dev->image_format == 1)
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006009 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6010 spec->image_name, RBD_SUFFIX);
Alex Elder332bb122013-04-27 09:59:30 -05006011 else
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006012 ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
6013 RBD_HEADER_PREFIX, spec->image_id);
Alex Elder332bb122013-04-27 09:59:30 -05006014
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006015 return ret;
Alex Elder332bb122013-04-27 09:59:30 -05006016}
6017
Alex Elder200a6a82013-04-28 23:32:34 -05006018static void rbd_dev_image_release(struct rbd_device *rbd_dev)
6019{
Alex Elder6fd48b32013-04-28 23:32:34 -05006020 rbd_dev_unprobe(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006021 if (rbd_dev->opts)
6022 rbd_unregister_watch(rbd_dev);
Alex Elder6fd48b32013-04-28 23:32:34 -05006023 rbd_dev->image_format = 0;
6024 kfree(rbd_dev->spec->image_id);
6025 rbd_dev->spec->image_id = NULL;
Alex Elder200a6a82013-04-28 23:32:34 -05006026}
6027
Alex Eldera30b71b2012-07-10 20:30:11 -05006028/*
6029 * Probe for the existence of the header object for the given rbd
Alex Elder1f3ef782013-05-06 17:40:33 -05006030 * device. If this image is the one being mapped (i.e., not a
6031 * parent), initiate a watch on its header object before using that
6032 * object to get detailed information about the rbd image.
Alex Eldera30b71b2012-07-10 20:30:11 -05006033 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006034static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
Alex Eldera30b71b2012-07-10 20:30:11 -05006035{
6036 int ret;
6037
6038 /*
Alex Elder3abef3b2013-05-13 20:35:37 -05006039 * Get the id from the image id object. Unless there's an
6040 * error, rbd_dev->spec->image_id will be filled in with
6041 * a dynamically-allocated string, and rbd_dev->image_format
6042 * will be set to either 1 or 2.
Alex Eldera30b71b2012-07-10 20:30:11 -05006043 */
6044 ret = rbd_dev_image_id(rbd_dev);
6045 if (ret)
Alex Elderc0fba362013-04-25 23:15:08 -05006046 return ret;
Alex Elderc0fba362013-04-25 23:15:08 -05006047
Alex Elder332bb122013-04-27 09:59:30 -05006048 ret = rbd_dev_header_name(rbd_dev);
6049 if (ret)
6050 goto err_out_format;
6051
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006052 if (!depth) {
Ilya Dryomov99d16942016-08-12 16:11:41 +02006053 ret = rbd_register_watch(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006054 if (ret) {
6055 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006056 pr_info("image %s/%s%s%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006057 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006058 rbd_dev->spec->pool_ns ?: "",
6059 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006060 rbd_dev->spec->image_name);
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006061 goto err_out_format;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006062 }
Alex Elder1f3ef782013-05-06 17:40:33 -05006063 }
Alex Elderb644de22013-04-27 09:59:31 -05006064
Ilya Dryomova720ae02014-07-23 17:11:19 +04006065 ret = rbd_dev_header_info(rbd_dev);
Alex Elder5655c4d2013-04-25 23:15:08 -05006066 if (ret)
Alex Elderb644de22013-04-27 09:59:31 -05006067 goto err_out_watch;
Alex Elder83a06262012-10-30 15:47:17 -05006068
Ilya Dryomov04077592014-07-23 17:11:20 +04006069 /*
6070 * If this image is the one being mapped, we have pool name and
6071 * id, image name and id, and snap name - need to fill snap id.
6072 * Otherwise this is a parent image, identified by pool, image
6073 * and snap ids - need to fill in names for those ids.
6074 */
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006075 if (!depth)
Ilya Dryomov04077592014-07-23 17:11:20 +04006076 ret = rbd_spec_fill_snap_id(rbd_dev);
6077 else
6078 ret = rbd_spec_fill_names(rbd_dev);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006079 if (ret) {
6080 if (ret == -ENOENT)
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006081 pr_info("snap %s/%s%s%s@%s does not exist\n",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006082 rbd_dev->spec->pool_name,
Ilya Dryomovb26c0472018-07-03 15:28:43 +02006083 rbd_dev->spec->pool_ns ?: "",
6084 rbd_dev->spec->pool_ns ? "/" : "",
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006085 rbd_dev->spec->image_name,
6086 rbd_dev->spec->snap_name);
Alex Elder33dca392013-04-30 00:44:33 -05006087 goto err_out_probe;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006088 }
Alex Elder9bb81c92013-04-27 09:59:30 -05006089
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006090 if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
6091 ret = rbd_dev_v2_parent_info(rbd_dev);
6092 if (ret)
6093 goto err_out_probe;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006094 }
6095
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006096 ret = rbd_dev_probe_parent(rbd_dev, depth);
Alex Elder30d60ba2013-05-06 09:51:30 -05006097 if (ret)
6098 goto err_out_probe;
Alex Elder83a06262012-10-30 15:47:17 -05006099
Alex Elder30d60ba2013-05-06 09:51:30 -05006100 dout("discovered format %u image, header name is %s\n",
Ilya Dryomovc41d13a2016-04-29 20:01:25 +02006101 rbd_dev->image_format, rbd_dev->header_oid.name);
Alex Elder30d60ba2013-05-06 09:51:30 -05006102 return 0;
Ilya Dryomove8f59b52014-07-24 10:42:13 +04006103
Alex Elder6fd48b32013-04-28 23:32:34 -05006104err_out_probe:
6105 rbd_dev_unprobe(rbd_dev);
Alex Elderb644de22013-04-27 09:59:31 -05006106err_out_watch:
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006107 if (!depth)
Ilya Dryomov99d16942016-08-12 16:11:41 +02006108 rbd_unregister_watch(rbd_dev);
Alex Elder332bb122013-04-27 09:59:30 -05006109err_out_format:
6110 rbd_dev->image_format = 0;
Alex Elder5655c4d2013-04-25 23:15:08 -05006111 kfree(rbd_dev->spec->image_id);
6112 rbd_dev->spec->image_id = NULL;
Alex Elder5655c4d2013-04-25 23:15:08 -05006113 return ret;
Alex Eldera30b71b2012-07-10 20:30:11 -05006114}
6115
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006116static ssize_t do_rbd_add(struct bus_type *bus,
6117 const char *buf,
6118 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006119{
Alex Eldercb8627c2012-07-09 21:04:23 -05006120 struct rbd_device *rbd_dev = NULL;
Alex Elderdc79b112012-10-25 23:34:41 -05006121 struct ceph_options *ceph_opts = NULL;
Alex Elder4e9afeb2012-10-25 23:34:41 -05006122 struct rbd_options *rbd_opts = NULL;
Alex Elder859c31d2012-10-25 23:34:42 -05006123 struct rbd_spec *spec = NULL;
Alex Elder9d3997f2012-10-25 23:34:42 -05006124 struct rbd_client *rbdc;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006125 int rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006126
6127 if (!try_module_get(THIS_MODULE))
6128 return -ENODEV;
6129
Alex Eldera725f65e2012-02-02 08:13:30 -06006130 /* parse add command */
Alex Elder859c31d2012-10-25 23:34:42 -05006131 rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
Alex Elderdc79b112012-10-25 23:34:41 -05006132 if (rc < 0)
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006133 goto out;
Alex Eldera725f65e2012-02-02 08:13:30 -06006134
Alex Elder9d3997f2012-10-25 23:34:42 -05006135 rbdc = rbd_get_client(ceph_opts);
6136 if (IS_ERR(rbdc)) {
6137 rc = PTR_ERR(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006138 goto err_out_args;
Alex Elder9d3997f2012-10-25 23:34:42 -05006139 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006140
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006141 /* pick the pool */
Ilya Dryomovdd435852018-02-22 13:43:24 +01006142 rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006143 if (rc < 0) {
6144 if (rc == -ENOENT)
6145 pr_info("pool %s does not exist\n", spec->pool_name);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006146 goto err_out_client;
Ilya Dryomov1fe48022015-03-05 10:47:22 +03006147 }
Alex Elderc0cd10db2013-04-26 09:43:47 -05006148 spec->pool_id = (u64)rc;
Alex Elder859c31d2012-10-25 23:34:42 -05006149
Ilya Dryomovd1475432015-06-22 13:24:48 +03006150 rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006151 if (!rbd_dev) {
6152 rc = -ENOMEM;
Alex Elderbd4ba652012-10-25 23:34:42 -05006153 goto err_out_client;
Ilya Dryomovb51c83c2015-10-15 15:38:57 +02006154 }
Alex Elderc53d5892012-10-25 23:34:42 -05006155 rbdc = NULL; /* rbd_dev now owns this */
6156 spec = NULL; /* rbd_dev now owns this */
Ilya Dryomovd1475432015-06-22 13:24:48 +03006157 rbd_opts = NULL; /* rbd_dev now owns this */
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006158
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006159 rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
6160 if (!rbd_dev->config_info) {
6161 rc = -ENOMEM;
6162 goto err_out_rbd_dev;
6163 }
6164
Ilya Dryomov811c6682016-04-15 16:22:16 +02006165 down_write(&rbd_dev->header_rwsem);
Ilya Dryomov6d69bb532015-10-11 19:38:00 +02006166 rc = rbd_dev_image_probe(rbd_dev, 0);
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006167 if (rc < 0) {
6168 up_write(&rbd_dev->header_rwsem);
Alex Elderc53d5892012-10-25 23:34:42 -05006169 goto err_out_rbd_dev;
Mike Christie0d6d1e9c2016-08-18 18:38:45 +02006170 }
Alex Elder05fd6f62012-08-29 17:11:07 -05006171
Alex Elder7ce4eef2013-05-06 17:40:33 -05006172	/* If we are mapping a snapshot, it must be marked read-only */
Alex Elder7ce4eef2013-05-06 17:40:33 -05006173 if (rbd_dev->spec->snap_id != CEPH_NOSNAP)
Ilya Dryomov9568c932017-10-12 12:35:19 +02006174 rbd_dev->opts->read_only = true;
Alex Elder7ce4eef2013-05-06 17:40:33 -05006175
Ilya Dryomov0c93e1b2019-01-30 15:14:48 +01006176 if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
6177 rbd_warn(rbd_dev, "alloc_size adjusted to %u",
6178 rbd_dev->layout.object_size);
6179 rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
6180 }
6181
Alex Elderb536f692013-04-28 23:32:34 -05006182 rc = rbd_dev_device_setup(rbd_dev);
Ilya Dryomovfd22aef2017-04-13 12:17:37 +02006183 if (rc)
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006184 goto err_out_image_probe;
Alex Elderb536f692013-04-28 23:32:34 -05006185
Ilya Dryomove010dd02017-04-13 12:17:39 +02006186 if (rbd_dev->opts->exclusive) {
6187 rc = rbd_add_acquire_lock(rbd_dev);
6188 if (rc)
6189 goto err_out_device_setup;
Alex Elderb536f692013-04-28 23:32:34 -05006190 }
6191
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006192 /* Everything's ready. Announce the disk to the world. */
6193
6194 rc = device_add(&rbd_dev->dev);
6195 if (rc)
Ilya Dryomove010dd02017-04-13 12:17:39 +02006196 goto err_out_image_lock;
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006197
6198 add_disk(rbd_dev->disk);
6199 /* see rbd_init_disk() */
6200 blk_put_queue(rbd_dev->disk->queue);
6201
6202 spin_lock(&rbd_dev_list_lock);
6203 list_add_tail(&rbd_dev->node, &rbd_dev_list);
6204 spin_unlock(&rbd_dev_list_lock);
6205
6206 pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
6207 (unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
6208 rbd_dev->header.features);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006209 rc = count;
6210out:
6211 module_put(THIS_MODULE);
6212 return rc;
Alex Elder3abef3b2013-05-13 20:35:37 -05006213
Ilya Dryomove010dd02017-04-13 12:17:39 +02006214err_out_image_lock:
6215 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006216err_out_device_setup:
6217 rbd_dev_device_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006218err_out_image_probe:
6219 rbd_dev_image_release(rbd_dev);
Alex Elderc53d5892012-10-25 23:34:42 -05006220err_out_rbd_dev:
6221 rbd_dev_destroy(rbd_dev);
Alex Elderbd4ba652012-10-25 23:34:42 -05006222err_out_client:
Alex Elder9d3997f2012-10-25 23:34:42 -05006223 rbd_put_client(rbdc);
Alex Elder0ddebc02012-10-25 23:34:41 -05006224err_out_args:
Alex Elder859c31d2012-10-25 23:34:42 -05006225 rbd_spec_put(spec);
Ilya Dryomovd1475432015-06-22 13:24:48 +03006226 kfree(rbd_opts);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006227 goto out;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006228}
6229
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006230static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006231{
6232 if (single_major)
6233 return -EINVAL;
6234
6235 return do_rbd_add(bus, buf, count);
6236}
6237
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006238static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
6239 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006240{
6241 return do_rbd_add(bus, buf, count);
6242}
6243
Alex Elder05a46af2013-04-26 15:44:36 -05006244static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
6245{
Alex Elderad945fc2013-04-26 15:44:36 -05006246 while (rbd_dev->parent) {
Alex Elder05a46af2013-04-26 15:44:36 -05006247 struct rbd_device *first = rbd_dev;
6248 struct rbd_device *second = first->parent;
6249 struct rbd_device *third;
6250
6251 /*
 6252		 * Walk down to the parent that has no grandparent and
 6253		 * remove it.
6254 */
6255 while (second && (third = second->parent)) {
6256 first = second;
6257 second = third;
6258 }
Alex Elderad945fc2013-04-26 15:44:36 -05006259 rbd_assert(second);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006260 rbd_dev_image_release(second);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006261 rbd_dev_destroy(second);
Alex Elderad945fc2013-04-26 15:44:36 -05006262 first->parent = NULL;
6263 first->parent_overlap = 0;
6264
6265 rbd_assert(first->parent_spec);
Alex Elder05a46af2013-04-26 15:44:36 -05006266 rbd_spec_put(first->parent_spec);
6267 first->parent_spec = NULL;
Alex Elder05a46af2013-04-26 15:44:36 -05006268 }
6269}
6270
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006271static ssize_t do_rbd_remove(struct bus_type *bus,
6272 const char *buf,
6273 size_t count)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006274{
6275 struct rbd_device *rbd_dev = NULL;
Alex Elder751cc0e2013-05-31 15:17:01 -05006276 struct list_head *tmp;
6277 int dev_id;
Mike Christie0276dca2016-08-18 18:38:45 +02006278 char opt_buf[6];
Mike Christie0276dca2016-08-18 18:38:45 +02006279 bool force = false;
Alex Elder0d8189e2013-04-27 09:59:30 -05006280 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006281
Mike Christie0276dca2016-08-18 18:38:45 +02006282 dev_id = -1;
6283 opt_buf[0] = '\0';
6284 sscanf(buf, "%d %5s", &dev_id, opt_buf);
6285 if (dev_id < 0) {
6286 pr_err("dev_id out of range\n");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006287 return -EINVAL;
Mike Christie0276dca2016-08-18 18:38:45 +02006288 }
6289 if (opt_buf[0] != '\0') {
6290 if (!strcmp(opt_buf, "force")) {
6291 force = true;
6292 } else {
6293 pr_err("bad remove option at '%s'\n", opt_buf);
6294 return -EINVAL;
6295 }
6296 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006297
Alex Elder751cc0e2013-05-31 15:17:01 -05006298 ret = -ENOENT;
6299 spin_lock(&rbd_dev_list_lock);
6300 list_for_each(tmp, &rbd_dev_list) {
6301 rbd_dev = list_entry(tmp, struct rbd_device, node);
6302 if (rbd_dev->dev_id == dev_id) {
6303 ret = 0;
6304 break;
6305 }
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006306 }
Alex Elder751cc0e2013-05-31 15:17:01 -05006307 if (!ret) {
6308 spin_lock_irq(&rbd_dev->lock);
Mike Christie0276dca2016-08-18 18:38:45 +02006309 if (rbd_dev->open_count && !force)
Alex Elder751cc0e2013-05-31 15:17:01 -05006310 ret = -EBUSY;
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006311 else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
6312 &rbd_dev->flags))
6313 ret = -EINPROGRESS;
Alex Elder751cc0e2013-05-31 15:17:01 -05006314 spin_unlock_irq(&rbd_dev->lock);
6315 }
6316 spin_unlock(&rbd_dev_list_lock);
Ilya Dryomov85f5a4d2019-01-08 19:47:38 +01006317 if (ret)
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006318 return ret;
Alex Elder751cc0e2013-05-31 15:17:01 -05006319
Mike Christie0276dca2016-08-18 18:38:45 +02006320 if (force) {
6321 /*
6322 * Prevent new IO from being queued and wait for existing
6323 * IO to complete/fail.
6324 */
6325 blk_mq_freeze_queue(rbd_dev->disk->queue);
6326 blk_set_queue_dying(rbd_dev->disk->queue);
6327 }
6328
Ilya Dryomov5769ed02017-04-13 12:17:38 +02006329 del_gendisk(rbd_dev->disk);
6330 spin_lock(&rbd_dev_list_lock);
6331 list_del_init(&rbd_dev->node);
6332 spin_unlock(&rbd_dev_list_lock);
6333 device_del(&rbd_dev->dev);
Ilya Dryomovfca27062013-12-16 18:02:40 +02006334
Ilya Dryomove010dd02017-04-13 12:17:39 +02006335 rbd_dev_image_unlock(rbd_dev);
Ilya Dryomovdd5ac322015-10-16 17:09:24 +02006336 rbd_dev_device_release(rbd_dev);
Alex Elder8ad42cd2013-04-28 23:32:34 -05006337 rbd_dev_image_release(rbd_dev);
Ilya Dryomov8b679ec2017-04-13 12:17:37 +02006338 rbd_dev_destroy(rbd_dev);
Alex Elder1ba0f1e2013-05-31 15:17:01 -05006339 return count;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006340}
6341
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006342static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006343{
6344 if (single_major)
6345 return -EINVAL;
6346
6347 return do_rbd_remove(bus, buf, count);
6348}
6349
Greg Kroah-Hartman7e9586b2018-12-21 08:54:38 +01006350static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
6351 size_t count)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006352{
6353 return do_rbd_remove(bus, buf, count);
6354}
6355
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006356/*
6357 * create control files in sysfs
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006358 * /sys/bus/rbd/...
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006359 */
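/*
 * A minimal sketch of the resulting interface, in the default
 * (non-single_major) configuration:
 *
 *	echo "<mon_addrs> <options> <pool> <image> [<snap>]" > /sys/bus/rbd/add
 *	echo "<dev-id> [force]" > /sys/bus/rbd/remove
 *
 * where <dev-id> is the device's id (see do_rbd_remove() above) and
 * "force" unmaps the device even while it is still open.
 */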
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006360static int __init rbd_sysfs_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006361{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006362 int ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006363
Alex Elderfed4c142012-02-07 12:03:36 -06006364 ret = device_register(&rbd_root_dev);
Alex Elder21079782012-01-24 10:08:36 -06006365 if (ret < 0)
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006366 return ret;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006367
Alex Elderfed4c142012-02-07 12:03:36 -06006368 ret = bus_register(&rbd_bus_type);
6369 if (ret < 0)
6370 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006371
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006372 return ret;
6373}
6374
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006375static void __exit rbd_sysfs_cleanup(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006376{
Yehuda Sadehdfc56062010-11-19 14:51:04 -08006377 bus_unregister(&rbd_bus_type);
Alex Elderfed4c142012-02-07 12:03:36 -06006378 device_unregister(&rbd_root_dev);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006379}
6380
Chengguang Xu7d8dc532018-08-12 23:06:54 +08006381static int __init rbd_slab_init(void)
Alex Elder1c2a9df2013-05-01 12:43:03 -05006382{
6383 rbd_assert(!rbd_img_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006384 rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
Alex Elder868311b2013-05-01 12:43:03 -05006385 if (!rbd_img_request_cache)
6386 return -ENOMEM;
6387
6388 rbd_assert(!rbd_obj_request_cache);
Geliang Tang03d94402016-03-13 15:17:32 +08006389 rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
Alex Elder78c2a442013-05-01 12:43:04 -05006390 if (!rbd_obj_request_cache)
6391 goto out_err;
6392
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006393 return 0;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006394
Ilya Dryomov6c696d82017-01-25 18:16:23 +01006395out_err:
Alex Elder868311b2013-05-01 12:43:03 -05006396 kmem_cache_destroy(rbd_img_request_cache);
6397 rbd_img_request_cache = NULL;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006398 return -ENOMEM;
6399}
6400
6401static void rbd_slab_exit(void)
6402{
Alex Elder868311b2013-05-01 12:43:03 -05006403 rbd_assert(rbd_obj_request_cache);
6404 kmem_cache_destroy(rbd_obj_request_cache);
6405 rbd_obj_request_cache = NULL;
6406
Alex Elder1c2a9df2013-05-01 12:43:03 -05006407 rbd_assert(rbd_img_request_cache);
6408 kmem_cache_destroy(rbd_img_request_cache);
6409 rbd_img_request_cache = NULL;
6410}
6411
Alex Eldercc344fa2013-02-19 12:25:56 -06006412static int __init rbd_init(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006413{
6414 int rc;
6415
Alex Elder1e32d342013-01-30 11:13:33 -06006416 if (!libceph_compatible(NULL)) {
6417 rbd_warn(NULL, "libceph incompatibility (quitting)");
Alex Elder1e32d342013-01-30 11:13:33 -06006418 return -EINVAL;
6419 }
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006420
Alex Elder1c2a9df2013-05-01 12:43:03 -05006421 rc = rbd_slab_init();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006422 if (rc)
6423 return rc;
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006424
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006425 /*
6426 * The number of active work items is limited by the number of
Ilya Dryomovf77303b2015-04-22 18:28:13 +03006427 * rbd devices * queue depth, so leave @max_active at default.
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006428 */
6429 rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
6430 if (!rbd_wq) {
6431 rc = -ENOMEM;
6432 goto err_out_slab;
6433 }
6434
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006435 if (single_major) {
6436 rbd_major = register_blkdev(0, RBD_DRV_NAME);
6437 if (rbd_major < 0) {
6438 rc = rbd_major;
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006439 goto err_out_wq;
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006440 }
6441 }
6442
Alex Elder1c2a9df2013-05-01 12:43:03 -05006443 rc = rbd_sysfs_init();
6444 if (rc)
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006445 goto err_out_blkdev;
Alex Elder1c2a9df2013-05-01 12:43:03 -05006446
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006447 if (single_major)
6448 pr_info("loaded (major %d)\n", rbd_major);
6449 else
6450 pr_info("loaded\n");
6451
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006452 return 0;
6453
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006454err_out_blkdev:
6455 if (single_major)
6456 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006457err_out_wq:
6458 destroy_workqueue(rbd_wq);
Ilya Dryomove1b4d962013-12-13 15:28:57 +02006459err_out_slab:
6460 rbd_slab_exit();
Alex Elder1c2a9df2013-05-01 12:43:03 -05006461 return rc;
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006462}
6463
Alex Eldercc344fa2013-02-19 12:25:56 -06006464static void __exit rbd_exit(void)
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006465{
Ilya Dryomovffe312c2014-05-20 15:46:04 +04006466 ida_destroy(&rbd_dev_id_ida);
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006467 rbd_sysfs_cleanup();
Ilya Dryomov9b60e702013-12-13 15:28:57 +02006468 if (single_major)
6469 unregister_blkdev(rbd_major, RBD_DRV_NAME);
Ilya Dryomovf5ee37b2014-10-09 17:06:01 +04006470 destroy_workqueue(rbd_wq);
Alex Elder1c2a9df2013-05-01 12:43:03 -05006471 rbd_slab_exit();
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006472}
6473
6474module_init(rbd_init);
6475module_exit(rbd_exit);
6476
Alex Elderd552c612013-05-31 20:13:09 -05006477MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006478MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
6479MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006480/* following authorship retained from original osdblk.c */
6481MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
6482
Ilya Dryomov90da2582013-12-13 15:28:56 +02006483MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
Yehuda Sadeh602adf42010-08-12 16:11:25 -07006484MODULE_LICENSE("GPL");