blob: 34aa9f4c1780a47ba1b420dc9fb877b04d5afa27 [file] [log] [blame]
Greg Kroah-Hartmanb2441312017-11-01 15:07:57 +01001// SPDX-License-Identifier: GPL-2.0
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07002#include <linux/ceph/ceph_debug.h>
Sage Weila8599bd2009-10-06 11:31:12 -07003
4#include <linux/fs.h>
5#include <linux/kernel.h>
Ingo Molnar174cd4b2017-02-02 19:15:33 +01006#include <linux/sched/signal.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +09007#include <linux/slab.h>
Sage Weila8599bd2009-10-06 11:31:12 -07008#include <linux/vmalloc.h>
9#include <linux/wait.h>
Stephen Rothwellf1a3d572010-01-18 11:53:08 +110010#include <linux/writeback.h>
Jeff Layton176c77c2019-06-06 08:06:40 -040011#include <linux/iversion.h>
Sage Weila8599bd2009-10-06 11:31:12 -070012
13#include "super.h"
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -070014#include "mds_client.h"
Milosz Tanski99ccbd22013-08-21 17:29:54 -040015#include "cache.h"
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -070016#include <linux/ceph/decode.h>
17#include <linux/ceph/messenger.h>
Sage Weila8599bd2009-10-06 11:31:12 -070018
19/*
20 * Capability management
21 *
22 * The Ceph metadata servers control client access to inode metadata
23 * and file data by issuing capabilities, granting clients permission
24 * to read and/or write both inode field and file data to OSDs
25 * (storage nodes). Each capability consists of a set of bits
26 * indicating which operations are allowed.
27 *
28 * If the client holds a *_SHARED cap, the client has a coherent value
29 * that can be safely read from the cached inode.
30 *
31 * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
32 * client is allowed to change inode attributes (e.g., file size,
33 * mtime), note its dirty state in the ceph_cap, and asynchronously
34 * flush that metadata change to the MDS.
35 *
36 * In the event of a conflicting operation (perhaps by another
37 * client), the MDS will revoke the conflicting client capabilities.
38 *
39 * In order for a client to cache an inode, it must hold a capability
40 * with at least one MDS server. When inodes are released, release
41 * notifications are batched and periodically sent en masse to the MDS
42 * cluster to release server state.
43 */
44
Yan, Zheng0e294382016-07-04 18:06:41 +080045static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
Yan, Zheng7bc00fd2016-07-07 18:34:45 +080046static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
47 struct ceph_mds_session *session,
48 struct ceph_inode_info *ci,
49 u64 oldest_flush_tid);
Sage Weila8599bd2009-10-06 11:31:12 -070050
/*
 * Generate readable cap strings for debugging output.
 *
 * ceph_cap_string() formats into one of MAX_CAP_STR static buffers that
 * are handed out round-robin, so a returned string is overwritten once
 * the slot index has wrapped back around to it.
 */
#define MAX_CAP_STR 20
static char cap_str[MAX_CAP_STR][40];	/* ring of formatting buffers */
static DEFINE_SPINLOCK(cap_str_lock);	/* protects last_cap_str */
static int last_cap_str;		/* next slot of cap_str to hand out */
58
59static char *gcap_string(char *s, int c)
60{
61 if (c & CEPH_CAP_GSHARED)
62 *s++ = 's';
63 if (c & CEPH_CAP_GEXCL)
64 *s++ = 'x';
65 if (c & CEPH_CAP_GCACHE)
66 *s++ = 'c';
67 if (c & CEPH_CAP_GRD)
68 *s++ = 'r';
69 if (c & CEPH_CAP_GWR)
70 *s++ = 'w';
71 if (c & CEPH_CAP_GBUFFER)
72 *s++ = 'b';
Yan, Zheng49a9f4f2018-04-25 17:30:23 +080073 if (c & CEPH_CAP_GWREXTEND)
74 *s++ = 'a';
Sage Weila8599bd2009-10-06 11:31:12 -070075 if (c & CEPH_CAP_GLAZYIO)
76 *s++ = 'l';
77 return s;
78}
79
80const char *ceph_cap_string(int caps)
81{
82 int i;
83 char *s;
84 int c;
85
86 spin_lock(&cap_str_lock);
87 i = last_cap_str++;
88 if (last_cap_str == MAX_CAP_STR)
89 last_cap_str = 0;
90 spin_unlock(&cap_str_lock);
91
92 s = cap_str[i];
93
94 if (caps & CEPH_CAP_PIN)
95 *s++ = 'p';
96
97 c = (caps >> CEPH_CAP_SAUTH) & 3;
98 if (c) {
99 *s++ = 'A';
100 s = gcap_string(s, c);
101 }
102
103 c = (caps >> CEPH_CAP_SLINK) & 3;
104 if (c) {
105 *s++ = 'L';
106 s = gcap_string(s, c);
107 }
108
109 c = (caps >> CEPH_CAP_SXATTR) & 3;
110 if (c) {
111 *s++ = 'X';
112 s = gcap_string(s, c);
113 }
114
115 c = caps >> CEPH_CAP_SFILE;
116 if (c) {
117 *s++ = 'F';
118 s = gcap_string(s, c);
119 }
120
121 if (s == cap_str[i])
122 *s++ = '-';
123 *s = 0;
124 return cap_str[i];
125}
126
Yehuda Sadeh37151662010-06-17 16:16:12 -0700127void ceph_caps_init(struct ceph_mds_client *mdsc)
Sage Weila8599bd2009-10-06 11:31:12 -0700128{
Yehuda Sadeh37151662010-06-17 16:16:12 -0700129 INIT_LIST_HEAD(&mdsc->caps_list);
130 spin_lock_init(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700131}
132
Yehuda Sadeh37151662010-06-17 16:16:12 -0700133void ceph_caps_finalize(struct ceph_mds_client *mdsc)
Sage Weila8599bd2009-10-06 11:31:12 -0700134{
135 struct ceph_cap *cap;
136
Yehuda Sadeh37151662010-06-17 16:16:12 -0700137 spin_lock(&mdsc->caps_list_lock);
138 while (!list_empty(&mdsc->caps_list)) {
139 cap = list_first_entry(&mdsc->caps_list,
140 struct ceph_cap, caps_item);
Sage Weila8599bd2009-10-06 11:31:12 -0700141 list_del(&cap->caps_item);
142 kmem_cache_free(ceph_cap_cachep, cap);
143 }
Yehuda Sadeh37151662010-06-17 16:16:12 -0700144 mdsc->caps_total_count = 0;
145 mdsc->caps_avail_count = 0;
146 mdsc->caps_use_count = 0;
147 mdsc->caps_reserve_count = 0;
148 mdsc->caps_min_count = 0;
149 spin_unlock(&mdsc->caps_list_lock);
Sage Weil85ccce42010-02-17 10:02:43 -0800150}
151
Yan, Zhengfe330322019-02-01 14:57:15 +0800152void ceph_adjust_caps_max_min(struct ceph_mds_client *mdsc,
153 struct ceph_mount_options *fsopt)
Sage Weil85ccce42010-02-17 10:02:43 -0800154{
Yehuda Sadeh37151662010-06-17 16:16:12 -0700155 spin_lock(&mdsc->caps_list_lock);
Yan, Zhengfe330322019-02-01 14:57:15 +0800156 mdsc->caps_min_count = fsopt->max_readdir;
157 if (mdsc->caps_min_count < 1024)
158 mdsc->caps_min_count = 1024;
159 mdsc->caps_use_max = fsopt->caps_max;
160 if (mdsc->caps_use_max > 0 &&
161 mdsc->caps_use_max < mdsc->caps_min_count)
162 mdsc->caps_use_max = mdsc->caps_min_count;
Yehuda Sadeh37151662010-06-17 16:16:12 -0700163 spin_unlock(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700164}
165
Chengguang Xu7bf8f732018-07-28 23:15:35 +0800166static void __ceph_unreserve_caps(struct ceph_mds_client *mdsc, int nr_caps)
167{
168 struct ceph_cap *cap;
169 int i;
170
171 if (nr_caps) {
172 BUG_ON(mdsc->caps_reserve_count < nr_caps);
173 mdsc->caps_reserve_count -= nr_caps;
174 if (mdsc->caps_avail_count >=
175 mdsc->caps_reserve_count + mdsc->caps_min_count) {
176 mdsc->caps_total_count -= nr_caps;
177 for (i = 0; i < nr_caps; i++) {
178 cap = list_first_entry(&mdsc->caps_list,
179 struct ceph_cap, caps_item);
180 list_del(&cap->caps_item);
181 kmem_cache_free(ceph_cap_cachep, cap);
182 }
183 } else {
184 mdsc->caps_avail_count += nr_caps;
185 }
186
187 dout("%s: caps %d = %d used + %d resv + %d avail\n",
188 __func__,
189 mdsc->caps_total_count, mdsc->caps_use_count,
190 mdsc->caps_reserve_count, mdsc->caps_avail_count);
191 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
192 mdsc->caps_reserve_count +
193 mdsc->caps_avail_count);
194 }
195}
196
/*
 * Reserve @need caps in the pool on behalf of @ctx.
 *
 * Available caps already in the pool are claimed first; the remainder is
 * allocated with GFP_NOFS.  On the first allocation failure every MDS
 * session is asked to trim caps (once only), and whatever became
 * available is claimed before allocation is retried.  A second
 * allocation failure gives up.
 *
 * On success ctx->count is set to @need and ctx->used to 0.  On failure
 * @ctx is left untouched and everything claimed or allocated here is
 * handed back through __ceph_unreserve_caps().
 *
 * Returns 0 on success, -ENOMEM on failure.
 *
 * Called under mdsc->mutex.  The mutex is dropped and retaken around the
 * per-session trimming.
 */
int ceph_reserve_caps(struct ceph_mds_client *mdsc,
		      struct ceph_cap_reservation *ctx, int need)
{
	int i, j;
	struct ceph_cap *cap;
	int have;		/* caps claimed from the available pool */
	int alloc = 0;		/* caps freshly allocated onto newcaps */
	int max_caps;
	int err = 0;
	bool trimmed = false;	/* sessions are only trimmed once */
	struct ceph_mds_session *s;
	LIST_HEAD(newcaps);

	dout("reserve caps ctx=%p need=%d\n", ctx, need);

	/* first reserve any caps that are already allocated */
	spin_lock(&mdsc->caps_list_lock);
	if (mdsc->caps_avail_count >= need)
		have = need;
	else
		have = mdsc->caps_avail_count;
	mdsc->caps_avail_count -= have;
	mdsc->caps_reserve_count += have;
	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);
	spin_unlock(&mdsc->caps_list_lock);

	/* i counts caps secured so far (claimed + allocated) */
	for (i = have; i < need; ) {
		cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
		if (cap) {
			list_add(&cap->caps_item, &newcaps);
			alloc++;
			i++;
			continue;
		}

		/* allocation failed: try to trim the sessions, once */
		if (!trimmed) {
			for (j = 0; j < mdsc->max_sessions; j++) {
				s = __ceph_lookup_mds_session(mdsc, j);
				if (!s)
					continue;
				/* mdsc->mutex is released before taking s_mutex */
				mutex_unlock(&mdsc->mutex);

				mutex_lock(&s->s_mutex);
				/* ask the session to shed as many caps as we still lack */
				max_caps = s->s_nr_caps - (need - i);
				ceph_trim_caps(mdsc, s, max_caps);
				mutex_unlock(&s->s_mutex);

				ceph_put_mds_session(s);
				mutex_lock(&mdsc->mutex);
			}
			trimmed = true;

			/* claim whatever is available in the pool now */
			spin_lock(&mdsc->caps_list_lock);
			if (mdsc->caps_avail_count) {
				int more_have;
				if (mdsc->caps_avail_count >= need - i)
					more_have = need - i;
				else
					more_have = mdsc->caps_avail_count;

				i += more_have;
				have += more_have;
				mdsc->caps_avail_count -= more_have;
				mdsc->caps_reserve_count += more_have;

			}
			spin_unlock(&mdsc->caps_list_lock);

			continue;
		}

		/* already trimmed and still cannot allocate: give up */
		pr_warn("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
			ctx, need, have + alloc);
		err = -ENOMEM;
		break;
	}

	if (!err) {
		BUG_ON(have + alloc != need);
		ctx->count = need;
		ctx->used = 0;
	}

	/* account for the new allocations and move them into the pool */
	spin_lock(&mdsc->caps_list_lock);
	mdsc->caps_total_count += alloc;
	mdsc->caps_reserve_count += alloc;
	list_splice(&newcaps, &mdsc->caps_list);

	BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
					 mdsc->caps_reserve_count +
					 mdsc->caps_avail_count);

	/* on failure, undo the partial reservation made above */
	if (err)
		__ceph_unreserve_caps(mdsc, have + alloc);

	spin_unlock(&mdsc->caps_list_lock);

	/* note: the counters are read here without caps_list_lock */
	dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
	     ctx, mdsc->caps_total_count, mdsc->caps_use_count,
	     mdsc->caps_reserve_count, mdsc->caps_avail_count);
	return err;
}
304
Chengguang Xu7bf8f732018-07-28 23:15:35 +0800305void ceph_unreserve_caps(struct ceph_mds_client *mdsc,
Yan, Zhengfe330322019-02-01 14:57:15 +0800306 struct ceph_cap_reservation *ctx)
Sage Weila8599bd2009-10-06 11:31:12 -0700307{
Yan, Zhengfe330322019-02-01 14:57:15 +0800308 bool reclaim = false;
309 if (!ctx->count)
310 return;
311
Sage Weila8599bd2009-10-06 11:31:12 -0700312 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
Chengguang Xu7bf8f732018-07-28 23:15:35 +0800313 spin_lock(&mdsc->caps_list_lock);
314 __ceph_unreserve_caps(mdsc, ctx->count);
315 ctx->count = 0;
Yan, Zhengfe330322019-02-01 14:57:15 +0800316
317 if (mdsc->caps_use_max > 0 &&
318 mdsc->caps_use_count > mdsc->caps_use_max)
319 reclaim = true;
Chengguang Xu7bf8f732018-07-28 23:15:35 +0800320 spin_unlock(&mdsc->caps_list_lock);
Yan, Zhengfe330322019-02-01 14:57:15 +0800321
322 if (reclaim)
323 ceph_reclaim_caps_nr(mdsc, ctx->used);
Sage Weila8599bd2009-10-06 11:31:12 -0700324}
325
Yan, Zhengd9df2782014-04-18 09:57:11 +0800326struct ceph_cap *ceph_get_cap(struct ceph_mds_client *mdsc,
327 struct ceph_cap_reservation *ctx)
Sage Weila8599bd2009-10-06 11:31:12 -0700328{
329 struct ceph_cap *cap = NULL;
330
331 /* temporary, until we do something about cap import/export */
Sage Weil443b3762010-06-29 09:28:39 -0700332 if (!ctx) {
333 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
334 if (cap) {
Yan, Zheng4d1d05342012-11-03 10:32:37 +0800335 spin_lock(&mdsc->caps_list_lock);
Yehuda Sadeh37151662010-06-17 16:16:12 -0700336 mdsc->caps_use_count++;
337 mdsc->caps_total_count++;
Yan, Zheng4d1d05342012-11-03 10:32:37 +0800338 spin_unlock(&mdsc->caps_list_lock);
Chengguang Xue327ce02018-02-24 18:35:29 +0800339 } else {
340 spin_lock(&mdsc->caps_list_lock);
341 if (mdsc->caps_avail_count) {
342 BUG_ON(list_empty(&mdsc->caps_list));
343
344 mdsc->caps_avail_count--;
345 mdsc->caps_use_count++;
346 cap = list_first_entry(&mdsc->caps_list,
347 struct ceph_cap, caps_item);
348 list_del(&cap->caps_item);
349
350 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
351 mdsc->caps_reserve_count + mdsc->caps_avail_count);
352 }
353 spin_unlock(&mdsc->caps_list_lock);
Sage Weil443b3762010-06-29 09:28:39 -0700354 }
Chengguang Xue327ce02018-02-24 18:35:29 +0800355
Sage Weil443b3762010-06-29 09:28:39 -0700356 return cap;
357 }
Sage Weila8599bd2009-10-06 11:31:12 -0700358
Yehuda Sadeh37151662010-06-17 16:16:12 -0700359 spin_lock(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700360 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
Yehuda Sadeh37151662010-06-17 16:16:12 -0700361 ctx, ctx->count, mdsc->caps_total_count, mdsc->caps_use_count,
362 mdsc->caps_reserve_count, mdsc->caps_avail_count);
Sage Weila8599bd2009-10-06 11:31:12 -0700363 BUG_ON(!ctx->count);
Yehuda Sadeh37151662010-06-17 16:16:12 -0700364 BUG_ON(ctx->count > mdsc->caps_reserve_count);
365 BUG_ON(list_empty(&mdsc->caps_list));
Sage Weila8599bd2009-10-06 11:31:12 -0700366
367 ctx->count--;
Yan, Zhengfe330322019-02-01 14:57:15 +0800368 ctx->used++;
Yehuda Sadeh37151662010-06-17 16:16:12 -0700369 mdsc->caps_reserve_count--;
370 mdsc->caps_use_count++;
Sage Weila8599bd2009-10-06 11:31:12 -0700371
Yehuda Sadeh37151662010-06-17 16:16:12 -0700372 cap = list_first_entry(&mdsc->caps_list, struct ceph_cap, caps_item);
Sage Weila8599bd2009-10-06 11:31:12 -0700373 list_del(&cap->caps_item);
374
Yehuda Sadeh37151662010-06-17 16:16:12 -0700375 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
376 mdsc->caps_reserve_count + mdsc->caps_avail_count);
377 spin_unlock(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700378 return cap;
379}
380
Yehuda Sadeh37151662010-06-17 16:16:12 -0700381void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap)
Sage Weila8599bd2009-10-06 11:31:12 -0700382{
Yehuda Sadeh37151662010-06-17 16:16:12 -0700383 spin_lock(&mdsc->caps_list_lock);
Sage Weil7c1332b2010-02-16 11:39:45 -0800384 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
Yehuda Sadeh37151662010-06-17 16:16:12 -0700385 cap, mdsc->caps_total_count, mdsc->caps_use_count,
386 mdsc->caps_reserve_count, mdsc->caps_avail_count);
387 mdsc->caps_use_count--;
Sage Weila8599bd2009-10-06 11:31:12 -0700388 /*
Sage Weil85ccce42010-02-17 10:02:43 -0800389 * Keep some preallocated caps around (ceph_min_count), to
390 * avoid lots of free/alloc churn.
Sage Weila8599bd2009-10-06 11:31:12 -0700391 */
Yehuda Sadeh37151662010-06-17 16:16:12 -0700392 if (mdsc->caps_avail_count >= mdsc->caps_reserve_count +
393 mdsc->caps_min_count) {
394 mdsc->caps_total_count--;
Sage Weila8599bd2009-10-06 11:31:12 -0700395 kmem_cache_free(ceph_cap_cachep, cap);
396 } else {
Yehuda Sadeh37151662010-06-17 16:16:12 -0700397 mdsc->caps_avail_count++;
398 list_add(&cap->caps_item, &mdsc->caps_list);
Sage Weila8599bd2009-10-06 11:31:12 -0700399 }
400
Yehuda Sadeh37151662010-06-17 16:16:12 -0700401 BUG_ON(mdsc->caps_total_count != mdsc->caps_use_count +
402 mdsc->caps_reserve_count + mdsc->caps_avail_count);
403 spin_unlock(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700404}
405
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -0700406void ceph_reservation_status(struct ceph_fs_client *fsc,
Sage Weil85ccce42010-02-17 10:02:43 -0800407 int *total, int *avail, int *used, int *reserved,
408 int *min)
Sage Weila8599bd2009-10-06 11:31:12 -0700409{
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -0700410 struct ceph_mds_client *mdsc = fsc->mdsc;
Yehuda Sadeh37151662010-06-17 16:16:12 -0700411
Chengguang Xub8840142018-02-23 17:09:38 +0800412 spin_lock(&mdsc->caps_list_lock);
413
Sage Weila8599bd2009-10-06 11:31:12 -0700414 if (total)
Yehuda Sadeh37151662010-06-17 16:16:12 -0700415 *total = mdsc->caps_total_count;
Sage Weila8599bd2009-10-06 11:31:12 -0700416 if (avail)
Yehuda Sadeh37151662010-06-17 16:16:12 -0700417 *avail = mdsc->caps_avail_count;
Sage Weila8599bd2009-10-06 11:31:12 -0700418 if (used)
Yehuda Sadeh37151662010-06-17 16:16:12 -0700419 *used = mdsc->caps_use_count;
Sage Weila8599bd2009-10-06 11:31:12 -0700420 if (reserved)
Yehuda Sadeh37151662010-06-17 16:16:12 -0700421 *reserved = mdsc->caps_reserve_count;
Sage Weil85ccce42010-02-17 10:02:43 -0800422 if (min)
Yehuda Sadeh37151662010-06-17 16:16:12 -0700423 *min = mdsc->caps_min_count;
Chengguang Xub8840142018-02-23 17:09:38 +0800424
425 spin_unlock(&mdsc->caps_list_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700426}
427
428/*
429 * Find ceph_cap for given mds, if any.
430 *
Sage Weilbe655592011-11-30 09:47:09 -0800431 * Called with i_ceph_lock held.
Sage Weila8599bd2009-10-06 11:31:12 -0700432 */
433static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
434{
435 struct ceph_cap *cap;
436 struct rb_node *n = ci->i_caps.rb_node;
437
438 while (n) {
439 cap = rb_entry(n, struct ceph_cap, ci_node);
440 if (mds < cap->mds)
441 n = n->rb_left;
442 else if (mds > cap->mds)
443 n = n->rb_right;
444 else
445 return cap;
446 }
447 return NULL;
448}
449
Greg Farnum2bc50252010-06-30 12:44:34 -0700450struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, int mds)
451{
452 struct ceph_cap *cap;
453
Sage Weilbe655592011-11-30 09:47:09 -0800454 spin_lock(&ci->i_ceph_lock);
Greg Farnum2bc50252010-06-30 12:44:34 -0700455 cap = __get_cap_for_mds(ci, mds);
Sage Weilbe655592011-11-30 09:47:09 -0800456 spin_unlock(&ci->i_ceph_lock);
Greg Farnum2bc50252010-06-30 12:44:34 -0700457 return cap;
458}
459
Sage Weila8599bd2009-10-06 11:31:12 -0700460/*
Sage Weilbe655592011-11-30 09:47:09 -0800461 * Called under i_ceph_lock.
Sage Weila8599bd2009-10-06 11:31:12 -0700462 */
463static void __insert_cap_node(struct ceph_inode_info *ci,
464 struct ceph_cap *new)
465{
466 struct rb_node **p = &ci->i_caps.rb_node;
467 struct rb_node *parent = NULL;
468 struct ceph_cap *cap = NULL;
469
470 while (*p) {
471 parent = *p;
472 cap = rb_entry(parent, struct ceph_cap, ci_node);
473 if (new->mds < cap->mds)
474 p = &(*p)->rb_left;
475 else if (new->mds > cap->mds)
476 p = &(*p)->rb_right;
477 else
478 BUG();
479 }
480
481 rb_link_node(&new->ci_node, parent, p);
482 rb_insert_color(&new->ci_node, &ci->i_caps);
483}
484
485/*
486 * (re)set cap hold timeouts, which control the delayed release
487 * of unused caps back to the MDS. Should be called on cap use.
488 */
489static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
490 struct ceph_inode_info *ci)
491{
Yan, Zhengfe330322019-02-01 14:57:15 +0800492 struct ceph_mount_options *opt = mdsc->fsc->mount_options;
Sage Weila8599bd2009-10-06 11:31:12 -0700493 ci->i_hold_caps_max = round_jiffies(jiffies +
Yan, Zhengfe330322019-02-01 14:57:15 +0800494 opt->caps_wanted_delay_max * HZ);
Yan, Zhenga0d93e32020-03-05 20:21:01 +0800495 dout("__cap_set_timeouts %p %lu\n", &ci->vfs_inode,
496 ci->i_hold_caps_max - jiffies);
Sage Weila8599bd2009-10-06 11:31:12 -0700497}
498
499/*
500 * (Re)queue cap at the end of the delayed cap release list.
501 *
502 * If I_FLUSH is set, leave the inode at the front of the list.
503 *
Sage Weilbe655592011-11-30 09:47:09 -0800504 * Caller holds i_ceph_lock
Sage Weila8599bd2009-10-06 11:31:12 -0700505 * -> we take mdsc->cap_delay_lock
506 */
507static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
Yan, Zhenga0d93e32020-03-05 20:21:01 +0800508 struct ceph_inode_info *ci)
Sage Weila8599bd2009-10-06 11:31:12 -0700509{
Jeff Layton891f3f52020-01-14 15:06:40 -0500510 dout("__cap_delay_requeue %p flags 0x%lx at %lu\n", &ci->vfs_inode,
Sage Weila8599bd2009-10-06 11:31:12 -0700511 ci->i_ceph_flags, ci->i_hold_caps_max);
512 if (!mdsc->stopping) {
513 spin_lock(&mdsc->cap_delay_lock);
514 if (!list_empty(&ci->i_cap_delay_list)) {
515 if (ci->i_ceph_flags & CEPH_I_FLUSH)
516 goto no_change;
517 list_del_init(&ci->i_cap_delay_list);
518 }
Yan, Zhenga0d93e32020-03-05 20:21:01 +0800519 __cap_set_timeouts(mdsc, ci);
Sage Weila8599bd2009-10-06 11:31:12 -0700520 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
521no_change:
522 spin_unlock(&mdsc->cap_delay_lock);
523 }
524}
525
526/*
527 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
528 * indicating we should send a cap message to flush dirty metadata
529 * asap, and move to the front of the delayed cap list.
530 */
531static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
532 struct ceph_inode_info *ci)
533{
534 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
535 spin_lock(&mdsc->cap_delay_lock);
536 ci->i_ceph_flags |= CEPH_I_FLUSH;
537 if (!list_empty(&ci->i_cap_delay_list))
538 list_del_init(&ci->i_cap_delay_list);
539 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
540 spin_unlock(&mdsc->cap_delay_lock);
541}
542
543/*
544 * Cancel delayed work on cap.
545 *
Sage Weilbe655592011-11-30 09:47:09 -0800546 * Caller must hold i_ceph_lock.
Sage Weila8599bd2009-10-06 11:31:12 -0700547 */
548static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
549 struct ceph_inode_info *ci)
550{
551 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
552 if (list_empty(&ci->i_cap_delay_list))
553 return;
554 spin_lock(&mdsc->cap_delay_lock);
555 list_del_init(&ci->i_cap_delay_list);
556 spin_unlock(&mdsc->cap_delay_lock);
557}
558
Jeff Layton785892f2020-01-02 07:11:38 -0500559/* Common issue checks for add_cap, handle_cap_grant. */
Sage Weila8599bd2009-10-06 11:31:12 -0700560static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
561 unsigned issued)
562{
563 unsigned had = __ceph_caps_issued(ci, NULL);
564
Jeff Layton785892f2020-01-02 07:11:38 -0500565 lockdep_assert_held(&ci->i_ceph_lock);
566
Sage Weila8599bd2009-10-06 11:31:12 -0700567 /*
568 * Each time we receive FILE_CACHE anew, we increment
569 * i_rdcache_gen.
570 */
Yan, Zheng525d15e2019-05-11 17:27:59 +0800571 if (S_ISREG(ci->vfs_inode.i_mode) &&
572 (issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
Milosz Tanski99ccbd22013-08-21 17:29:54 -0400573 (had & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0) {
Sage Weila8599bd2009-10-06 11:31:12 -0700574 ci->i_rdcache_gen++;
Milosz Tanski99ccbd22013-08-21 17:29:54 -0400575 }
Sage Weila8599bd2009-10-06 11:31:12 -0700576
577 /*
Yan, Zheng15b51bd2017-09-06 10:15:16 +0800578 * If FILE_SHARED is newly issued, mark dir not complete. We don't
579 * know what happened to this directory while we didn't have the cap.
580 * If FILE_SHARED is being revoked, also mark dir not complete. It
581 * stops on-going cached readdir.
Sage Weila8599bd2009-10-06 11:31:12 -0700582 */
Yan, Zheng15b51bd2017-09-06 10:15:16 +0800583 if ((issued & CEPH_CAP_FILE_SHARED) != (had & CEPH_CAP_FILE_SHARED)) {
584 if (issued & CEPH_CAP_FILE_SHARED)
Yan, Zheng97aeb6b2017-11-27 10:47:46 +0800585 atomic_inc(&ci->i_shared_gen);
Yan, Zhenga8673d62013-02-18 16:38:14 +0800586 if (S_ISDIR(ci->vfs_inode.i_mode)) {
587 dout(" marking %p NOT complete\n", &ci->vfs_inode);
Yan, Zheng2f276c52013-03-13 19:44:32 +0800588 __ceph_dir_clear_complete(ci);
Yan, Zhenga8673d62013-02-18 16:38:14 +0800589 }
Sage Weila8599bd2009-10-06 11:31:12 -0700590 }
Jeff Layton785892f2020-01-02 07:11:38 -0500591
592 /* Wipe saved layout if we're losing DIR_CREATE caps */
593 if (S_ISDIR(ci->vfs_inode.i_mode) && (had & CEPH_CAP_DIR_CREATE) &&
594 !(issued & CEPH_CAP_DIR_CREATE)) {
595 ceph_put_string(rcu_dereference_raw(ci->i_cached_layout.pool_ns));
596 memset(&ci->i_cached_layout, 0, sizeof(ci->i_cached_layout));
597 }
Sage Weila8599bd2009-10-06 11:31:12 -0700598}
599
/*
 * Add a capability under the given MDS session, or update the one the
 * inode already holds from that session's MDS.
 *
 * Caller should hold session snap_rwsem (read) and ci->i_ceph_lock
 *
 * @inode:    inode receiving the cap
 * @session:  MDS session the cap belongs to; its s_mds selects the cap
 * @cap_id:   cap id, stored in the cap
 * @issued:   cap bits being issued
 * @wanted:   cap bits the MDS believes we want
 * @seq:      issue sequence number
 * @mseq:     migration sequence number
 * @realmino: ino of the snap realm the inode should be attached to
 * @flags:    CEPH_CAP_FLAG_AUTH marks the cap as coming from the auth MDS
 * @new_cap:  preallocated cap; consumed (and *new_cap set to NULL) only
 *            when the inode has no cap for this MDS yet
 */
void ceph_add_cap(struct inode *inode,
		  struct ceph_mds_session *session, u64 cap_id,
		  unsigned issued, unsigned wanted,
		  unsigned seq, unsigned mseq, u64 realmino, int flags,
		  struct ceph_cap **new_cap)
{
	struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_cap *cap;
	int mds = session->s_mds;
	int actual_wanted;
	u32 gen;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
	     session->s_mds, cap_id, ceph_cap_string(issued), seq);

	/* sample the session's cap generation under its lock */
	spin_lock(&session->s_gen_ttl_lock);
	gen = session->s_cap_gen;
	spin_unlock(&session->s_gen_ttl_lock);

	cap = __get_cap_for_mds(ci, mds);
	if (!cap) {
		/* first cap from this MDS: take over the caller's cap */
		cap = *new_cap;
		*new_cap = NULL;

		cap->issued = 0;
		cap->implemented = 0;
		cap->mds = mds;
		cap->mds_wanted = 0;
		cap->mseq = 0;

		cap->ci = ci;
		__insert_cap_node(ci, cap);

		/* add to session cap list */
		cap->session = session;
		spin_lock(&session->s_cap_lock);
		list_add_tail(&cap->session_caps, &session->s_caps);
		session->s_nr_caps++;
		spin_unlock(&session->s_cap_lock);
	} else {
		/* existing cap: move it to the tail of the session's list */
		spin_lock(&session->s_cap_lock);
		list_move_tail(&cap->session_caps, &session->s_caps);
		spin_unlock(&session->s_cap_lock);

		/* cap is from an older session generation: keep only PIN */
		if (cap->cap_gen < gen)
			cap->issued = cap->implemented = CEPH_CAP_PIN;

		/*
		 * auth mds of the inode changed. we received the cap export
		 * message, but still haven't received the cap import message.
		 * handle_cap_export() updated the new auth MDS' cap.
		 *
		 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
		 * a message that was sent before the cap import message. So
		 * don't remove caps.
		 */
		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
			WARN_ON(cap != ci->i_auth_cap);
			WARN_ON(cap->cap_id != cap_id);
			seq = cap->seq;
			mseq = cap->mseq;
			issued |= cap->issued;
			flags |= CEPH_CAP_FLAG_AUTH;
		}
	}

	/*
	 * (Re)attach to a snap realm when the inode has none yet, or when
	 * an auth cap names a realm (realmino != -1) different from the
	 * current one.
	 */
	if (!ci->i_snap_realm ||
	    ((flags & CEPH_CAP_FLAG_AUTH) &&
	     realmino != (u64)-1 && ci->i_snap_realm->ino != realmino)) {
		/*
		 * add this inode to the appropriate snap realm
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
			if (oldrealm) {
				/* unlink from the old realm's inode list */
				spin_lock(&oldrealm->inodes_with_caps_lock);
				list_del_init(&ci->i_snap_realm_item);
				spin_unlock(&oldrealm->inodes_with_caps_lock);
			}

			spin_lock(&realm->inodes_with_caps_lock);
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			ci->i_snap_realm = realm;
			/* this inode is the realm's own inode */
			if (realm->ino == ci->i_vino.ino)
				realm->inode = inode;
			spin_unlock(&realm->inodes_with_caps_lock);

			/* drop the old realm only after the switch is done */
			if (oldrealm)
				ceph_put_snap_realm(mdsc, oldrealm);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
			/* realm is NULL on this branch, so this always warns */
			WARN_ON(!realm);
		}
	}

	__check_cap_issue(ci, cap, issued);

	/*
	 * If we are issued caps we don't want, or the mds' wanted
	 * value appears to be off, queue a check so we'll release
	 * later and/or update the mds wanted value.
	 */
	actual_wanted = __ceph_caps_wanted(ci);
	if ((wanted & ~actual_wanted) ||
	    (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
		dout(" issued %s, mds wanted %s, actual %s, queueing\n",
		     ceph_cap_string(issued), ceph_cap_string(wanted),
		     ceph_cap_string(actual_wanted));
		__cap_delay_requeue(mdsc, ci);
	}

	if (flags & CEPH_CAP_FLAG_AUTH) {
		/* become the auth cap if there is none or ours has a newer mseq */
		if (!ci->i_auth_cap ||
		    ceph_seq_cmp(ci->i_auth_cap->mseq, mseq) < 0) {
			ci->i_auth_cap = cap;
			cap->mds_wanted = wanted;
		}
	} else {
		WARN_ON(ci->i_auth_cap == cap);
	}

	dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
	     inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
	     ceph_cap_string(issued|cap->issued), seq, mds);
	cap->cap_id = cap_id;
	cap->issued = issued;
	cap->implemented |= issued;
	/* a newer mseq replaces mds_wanted; otherwise the bits accumulate */
	if (ceph_seq_cmp(mseq, cap->mseq) > 0)
		cap->mds_wanted = wanted;
	else
		cap->mds_wanted |= wanted;
	cap->seq = seq;
	cap->issue_seq = seq;
	cap->mseq = mseq;
	cap->cap_gen = gen;
}
752
753/*
754 * Return true if cap has not timed out and belongs to the current
755 * generation of the MDS session (i.e. has not gone 'stale' due to
756 * us losing touch with the mds).
757 */
758static int __cap_is_valid(struct ceph_cap *cap)
759{
760 unsigned long ttl;
Sage Weilcdac8302009-11-10 16:02:23 -0800761 u32 gen;
Sage Weila8599bd2009-10-06 11:31:12 -0700762
Alex Elderd8fb02a2012-01-12 17:48:10 -0800763 spin_lock(&cap->session->s_gen_ttl_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700764 gen = cap->session->s_cap_gen;
765 ttl = cap->session->s_cap_ttl;
Alex Elderd8fb02a2012-01-12 17:48:10 -0800766 spin_unlock(&cap->session->s_gen_ttl_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700767
Sage Weil685f9a5d2009-11-09 12:05:48 -0800768 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
Sage Weila8599bd2009-10-06 11:31:12 -0700769 dout("__cap_is_valid %p cap %p issued %s "
770 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
Sage Weil685f9a5d2009-11-09 12:05:48 -0800771 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
Sage Weila8599bd2009-10-06 11:31:12 -0700772 return 0;
773 }
774
775 return 1;
776}
777
778/*
779 * Return set of valid cap bits issued to us. Note that caps time
780 * out, and may be invalidated in bulk if the client session times out
781 * and session->s_cap_gen is bumped.
782 */
783int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
784{
Yan, Zhengd9df2782014-04-18 09:57:11 +0800785 int have = ci->i_snap_caps;
Sage Weila8599bd2009-10-06 11:31:12 -0700786 struct ceph_cap *cap;
787 struct rb_node *p;
788
789 if (implemented)
790 *implemented = 0;
791 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
792 cap = rb_entry(p, struct ceph_cap, ci_node);
793 if (!__cap_is_valid(cap))
794 continue;
795 dout("__ceph_caps_issued %p cap %p issued %s\n",
796 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
797 have |= cap->issued;
798 if (implemented)
799 *implemented |= cap->implemented;
800 }
Yan, Zhengb1530f52013-07-02 12:40:20 +0800801 /*
802 * exclude caps issued by non-auth MDS, but are been revoking
803 * by the auth MDS. The non-auth MDS should be revoking/exporting
804 * these caps, but the message is delayed.
805 */
806 if (ci->i_auth_cap) {
807 cap = ci->i_auth_cap;
808 have &= ~cap->implemented | cap->issued;
809 }
Sage Weila8599bd2009-10-06 11:31:12 -0700810 return have;
811}
812
813/*
814 * Get cap bits issued by caps other than @ocap
815 */
816int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
817{
818 int have = ci->i_snap_caps;
819 struct ceph_cap *cap;
820 struct rb_node *p;
821
822 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
823 cap = rb_entry(p, struct ceph_cap, ci_node);
824 if (cap == ocap)
825 continue;
826 if (!__cap_is_valid(cap))
827 continue;
828 have |= cap->issued;
829 }
830 return have;
831}
832
833/*
834 * Move a cap to the end of the LRU (oldest caps at list head, newest
835 * at list tail).
836 */
837static void __touch_cap(struct ceph_cap *cap)
838{
839 struct ceph_mds_session *s = cap->session;
840
Sage Weila8599bd2009-10-06 11:31:12 -0700841 spin_lock(&s->s_cap_lock);
Markus Elfringd37b1d92017-08-20 20:22:02 +0200842 if (!s->s_cap_iterator) {
Sage Weil5dacf092009-12-21 20:40:34 -0800843 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
844 s->s_mds);
845 list_move_tail(&cap->session_caps, &s->s_caps);
846 } else {
847 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
848 &cap->ci->vfs_inode, cap, s->s_mds);
849 }
Sage Weila8599bd2009-10-06 11:31:12 -0700850 spin_unlock(&s->s_cap_lock);
851}
852
853/*
854 * Check if we hold the given mask. If so, move the cap(s) to the
855 * front of their respective LRUs. (This is the preferred way for
856 * callers to check for caps they want.)
857 */
858int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
859{
860 struct ceph_cap *cap;
861 struct rb_node *p;
862 int have = ci->i_snap_caps;
863
864 if ((have & mask) == mask) {
Jeff Layton5ddc61f2019-04-23 13:40:02 -0400865 dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
866 " (mask %s)\n", ci->vfs_inode.i_ino,
Sage Weila8599bd2009-10-06 11:31:12 -0700867 ceph_cap_string(have),
868 ceph_cap_string(mask));
869 return 1;
870 }
871
872 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
873 cap = rb_entry(p, struct ceph_cap, ci_node);
874 if (!__cap_is_valid(cap))
875 continue;
876 if ((cap->issued & mask) == mask) {
Jeff Layton5ddc61f2019-04-23 13:40:02 -0400877 dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
878 " (mask %s)\n", ci->vfs_inode.i_ino, cap,
Sage Weila8599bd2009-10-06 11:31:12 -0700879 ceph_cap_string(cap->issued),
880 ceph_cap_string(mask));
881 if (touch)
882 __touch_cap(cap);
883 return 1;
884 }
885
886 /* does a combination of caps satisfy mask? */
887 have |= cap->issued;
888 if ((have & mask) == mask) {
Jeff Layton5ddc61f2019-04-23 13:40:02 -0400889 dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
890 " (mask %s)\n", ci->vfs_inode.i_ino,
Sage Weila8599bd2009-10-06 11:31:12 -0700891 ceph_cap_string(cap->issued),
892 ceph_cap_string(mask));
893 if (touch) {
894 struct rb_node *q;
895
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300896 /* touch this + preceding caps */
Sage Weila8599bd2009-10-06 11:31:12 -0700897 __touch_cap(cap);
898 for (q = rb_first(&ci->i_caps); q != p;
899 q = rb_next(q)) {
900 cap = rb_entry(q, struct ceph_cap,
901 ci_node);
902 if (!__cap_is_valid(cap))
903 continue;
Xiubo Li9f8b72b2019-12-16 00:12:07 -0500904 if (cap->issued & mask)
905 __touch_cap(cap);
Sage Weila8599bd2009-10-06 11:31:12 -0700906 }
907 }
908 return 1;
909 }
910 }
911
912 return 0;
913}
914
Xiubo Li1af16d52020-03-19 23:45:00 -0400915int __ceph_caps_issued_mask_metric(struct ceph_inode_info *ci, int mask,
916 int touch)
917{
918 struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
919 int r;
920
921 r = __ceph_caps_issued_mask(ci, mask, touch);
922 if (r)
923 ceph_update_cap_hit(&fsc->mdsc->metric);
924 else
925 ceph_update_cap_mis(&fsc->mdsc->metric);
926 return r;
927}
928
Sage Weila8599bd2009-10-06 11:31:12 -0700929/*
930 * Return true if mask caps are currently being revoked by an MDS.
931 */
Yan, Zheng6ee6b9532013-07-02 12:40:21 +0800932int __ceph_caps_revoking_other(struct ceph_inode_info *ci,
933 struct ceph_cap *ocap, int mask)
934{
935 struct ceph_cap *cap;
936 struct rb_node *p;
937
938 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
939 cap = rb_entry(p, struct ceph_cap, ci_node);
Yan, Zheng9563f882013-11-22 13:50:45 +0800940 if (cap != ocap &&
Yan, Zheng6ee6b9532013-07-02 12:40:21 +0800941 (cap->implemented & ~cap->issued & mask))
942 return 1;
943 }
944 return 0;
945}
946
Sage Weila8599bd2009-10-06 11:31:12 -0700947int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
948{
949 struct inode *inode = &ci->vfs_inode;
Yan, Zheng6ee6b9532013-07-02 12:40:21 +0800950 int ret;
Sage Weila8599bd2009-10-06 11:31:12 -0700951
Sage Weilbe655592011-11-30 09:47:09 -0800952 spin_lock(&ci->i_ceph_lock);
Yan, Zheng6ee6b9532013-07-02 12:40:21 +0800953 ret = __ceph_caps_revoking_other(ci, NULL, mask);
Sage Weilbe655592011-11-30 09:47:09 -0800954 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -0700955 dout("ceph_caps_revoking %p %s = %d\n", inode,
956 ceph_cap_string(mask), ret);
957 return ret;
958}
959
960int __ceph_caps_used(struct ceph_inode_info *ci)
961{
962 int used = 0;
963 if (ci->i_pin_ref)
964 used |= CEPH_CAP_PIN;
965 if (ci->i_rd_ref)
966 used |= CEPH_CAP_FILE_RD;
Yan, Zhengfdd4e152015-06-16 20:48:56 +0800967 if (ci->i_rdcache_ref ||
Yan, Zheng525d15e2019-05-11 17:27:59 +0800968 (S_ISREG(ci->vfs_inode.i_mode) &&
Yan, Zhengfdd4e152015-06-16 20:48:56 +0800969 ci->vfs_inode.i_data.nrpages))
Sage Weila8599bd2009-10-06 11:31:12 -0700970 used |= CEPH_CAP_FILE_CACHE;
971 if (ci->i_wr_ref)
972 used |= CEPH_CAP_FILE_WR;
Henry C Changd3d07202011-05-11 10:29:54 +0000973 if (ci->i_wb_ref || ci->i_wrbuffer_ref)
Sage Weila8599bd2009-10-06 11:31:12 -0700974 used |= CEPH_CAP_FILE_BUFFER;
Jeff Laytonf85122a2019-04-02 08:04:30 -0400975 if (ci->i_fx_ref)
976 used |= CEPH_CAP_FILE_EXCL;
Sage Weila8599bd2009-10-06 11:31:12 -0700977 return used;
978}
979
Yan, Zheng719a2512020-03-05 20:21:00 +0800980#define FMODE_WAIT_BIAS 1000
981
Sage Weila8599bd2009-10-06 11:31:12 -0700982/*
983 * wanted, by virtue of open file modes
984 */
985int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
986{
Yan, Zheng719a2512020-03-05 20:21:00 +0800987 const int PIN_SHIFT = ffs(CEPH_FILE_MODE_PIN);
988 const int RD_SHIFT = ffs(CEPH_FILE_MODE_RD);
989 const int WR_SHIFT = ffs(CEPH_FILE_MODE_WR);
990 const int LAZY_SHIFT = ffs(CEPH_FILE_MODE_LAZY);
991 struct ceph_mount_options *opt =
992 ceph_inode_to_client(&ci->vfs_inode)->mount_options;
993 unsigned long used_cutoff = jiffies - opt->caps_wanted_delay_max * HZ;
994 unsigned long idle_cutoff = jiffies - opt->caps_wanted_delay_min * HZ;
995
996 if (S_ISDIR(ci->vfs_inode.i_mode)) {
997 int want = 0;
998
999 /* use used_cutoff here, to keep dir's wanted caps longer */
1000 if (ci->i_nr_by_mode[RD_SHIFT] > 0 ||
1001 time_after(ci->i_last_rd, used_cutoff))
1002 want |= CEPH_CAP_ANY_SHARED;
1003
1004 if (ci->i_nr_by_mode[WR_SHIFT] > 0 ||
1005 time_after(ci->i_last_wr, used_cutoff)) {
1006 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
1007 if (opt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)
1008 want |= CEPH_CAP_ANY_DIR_OPS;
1009 }
1010
1011 if (want || ci->i_nr_by_mode[PIN_SHIFT] > 0)
1012 want |= CEPH_CAP_PIN;
1013
1014 return want;
1015 } else {
1016 int bits = 0;
1017
1018 if (ci->i_nr_by_mode[RD_SHIFT] > 0) {
1019 if (ci->i_nr_by_mode[RD_SHIFT] >= FMODE_WAIT_BIAS ||
1020 time_after(ci->i_last_rd, used_cutoff))
1021 bits |= 1 << RD_SHIFT;
1022 } else if (time_after(ci->i_last_rd, idle_cutoff)) {
1023 bits |= 1 << RD_SHIFT;
1024 }
1025
1026 if (ci->i_nr_by_mode[WR_SHIFT] > 0) {
1027 if (ci->i_nr_by_mode[WR_SHIFT] >= FMODE_WAIT_BIAS ||
1028 time_after(ci->i_last_wr, used_cutoff))
1029 bits |= 1 << WR_SHIFT;
1030 } else if (time_after(ci->i_last_wr, idle_cutoff)) {
1031 bits |= 1 << WR_SHIFT;
1032 }
1033
1034 /* check lazyio only when read/write is wanted */
1035 if ((bits & (CEPH_FILE_MODE_RDWR << 1)) &&
1036 ci->i_nr_by_mode[LAZY_SHIFT] > 0)
1037 bits |= 1 << LAZY_SHIFT;
1038
1039 return bits ? ceph_caps_for_mode(bits >> 1) : 0;
Yan, Zheng774a6a12016-06-06 16:01:39 +08001040 }
Sage Weila8599bd2009-10-06 11:31:12 -07001041}
1042
1043/*
Yan, Zheng525d15e2019-05-11 17:27:59 +08001044 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
1045 */
1046int __ceph_caps_wanted(struct ceph_inode_info *ci)
1047{
1048 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
Jeff Laytona25949b2020-02-18 14:12:45 -05001049 if (S_ISDIR(ci->vfs_inode.i_mode)) {
1050 /* we want EXCL if holding caps of dir ops */
1051 if (w & CEPH_CAP_ANY_DIR_OPS)
1052 w |= CEPH_CAP_FILE_EXCL;
1053 } else {
Yan, Zheng525d15e2019-05-11 17:27:59 +08001054 /* we want EXCL if dirty data */
1055 if (w & CEPH_CAP_FILE_BUFFER)
1056 w |= CEPH_CAP_FILE_EXCL;
1057 }
1058 return w;
1059}
1060
1061/*
Sage Weila8599bd2009-10-06 11:31:12 -07001062 * Return caps we have registered with the MDS(s) as 'wanted'.
1063 */
Yan, Zhengc1944fe2017-01-29 22:15:47 +08001064int __ceph_caps_mds_wanted(struct ceph_inode_info *ci, bool check)
Sage Weila8599bd2009-10-06 11:31:12 -07001065{
1066 struct ceph_cap *cap;
1067 struct rb_node *p;
1068 int mds_wanted = 0;
1069
1070 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1071 cap = rb_entry(p, struct ceph_cap, ci_node);
Yan, Zhengc1944fe2017-01-29 22:15:47 +08001072 if (check && !__cap_is_valid(cap))
Sage Weila8599bd2009-10-06 11:31:12 -07001073 continue;
Yan, Zhenga2550602014-03-08 09:51:45 +08001074 if (cap == ci->i_auth_cap)
1075 mds_wanted |= cap->mds_wanted;
1076 else
1077 mds_wanted |= (cap->mds_wanted & ~CEPH_CAP_ANY_FILE_WR);
Sage Weila8599bd2009-10-06 11:31:12 -07001078 }
1079 return mds_wanted;
1080}
1081
Yan, Zheng9215aee2013-11-30 12:47:41 +08001082int ceph_is_any_caps(struct inode *inode)
1083{
1084 struct ceph_inode_info *ci = ceph_inode(inode);
1085 int ret;
1086
1087 spin_lock(&ci->i_ceph_lock);
Xiubo Libd84fbc2019-12-03 03:00:51 -05001088 ret = __ceph_is_any_real_caps(ci);
Yan, Zheng9215aee2013-11-30 12:47:41 +08001089 spin_unlock(&ci->i_ceph_lock);
1090
1091 return ret;
1092}
1093
Yan, Zhengdb40cc12015-03-23 20:12:20 +08001094static void drop_inode_snap_realm(struct ceph_inode_info *ci)
1095{
1096 struct ceph_snap_realm *realm = ci->i_snap_realm;
1097 spin_lock(&realm->inodes_with_caps_lock);
1098 list_del_init(&ci->i_snap_realm_item);
1099 ci->i_snap_realm_counter++;
1100 ci->i_snap_realm = NULL;
Yan, Zhengd95e6742019-01-10 15:41:09 +08001101 if (realm->ino == ci->i_vino.ino)
1102 realm->inode = NULL;
Yan, Zhengdb40cc12015-03-23 20:12:20 +08001103 spin_unlock(&realm->inodes_with_caps_lock);
1104 ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
1105 realm);
1106}
1107
Sage Weila8599bd2009-10-06 11:31:12 -07001108/*
Sage Weilf818a732010-05-11 20:56:31 -07001109 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
1110 *
Sage Weilbe655592011-11-30 09:47:09 -08001111 * caller should hold i_ceph_lock.
Sage Weila6369742010-02-22 13:59:00 -08001112 * caller will not hold session s_mutex if called from destroy_inode.
Sage Weila8599bd2009-10-06 11:31:12 -07001113 */
Yan, Zhenga096b092013-09-22 10:15:58 +08001114void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
Sage Weila8599bd2009-10-06 11:31:12 -07001115{
1116 struct ceph_mds_session *session = cap->session;
1117 struct ceph_inode_info *ci = cap->ci;
Cheng Renquan640ef792010-03-26 17:40:33 +08001118 struct ceph_mds_client *mdsc =
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07001119 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
Sage Weilf818a732010-05-11 20:56:31 -07001120 int removed = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07001121
1122 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
1123
Luis Henriquesea60ed62019-10-25 14:05:24 +01001124 /* remove from inode's cap rbtree, and clear auth cap */
1125 rb_erase(&cap->ci_node, &ci->i_caps);
1126 if (ci->i_auth_cap == cap)
1127 ci->i_auth_cap = NULL;
1128
Sage Weil7c1332b2010-02-16 11:39:45 -08001129 /* remove from session list */
1130 spin_lock(&session->s_cap_lock);
1131 if (session->s_cap_iterator == cap) {
1132 /* not yet, we are iterating over this very cap */
1133 dout("__ceph_remove_cap delaying %p removal from session %p\n",
1134 cap, cap->session);
1135 } else {
1136 list_del_init(&cap->session_caps);
1137 session->s_nr_caps--;
1138 cap->session = NULL;
Sage Weilf818a732010-05-11 20:56:31 -07001139 removed = 1;
Sage Weil7c1332b2010-02-16 11:39:45 -08001140 }
Sage Weilf818a732010-05-11 20:56:31 -07001141 /* protect backpointer with s_cap_lock: see iterate_session_caps */
1142 cap->ci = NULL;
Yan, Zheng745a8e32015-05-14 17:22:42 +08001143
1144 /*
1145 * s_cap_reconnect is protected by s_cap_lock. no one changes
1146 * s_cap_gen while session is in the reconnect state.
1147 */
1148 if (queue_release &&
1149 (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) {
1150 cap->queue_release = 1;
1151 if (removed) {
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08001152 __ceph_queue_cap_release(session, cap);
Yan, Zheng745a8e32015-05-14 17:22:42 +08001153 removed = 0;
1154 }
1155 } else {
1156 cap->queue_release = 0;
1157 }
1158 cap->cap_ino = ci->i_vino.ino;
1159
Sage Weil7c1332b2010-02-16 11:39:45 -08001160 spin_unlock(&session->s_cap_lock);
1161
Sage Weilf818a732010-05-11 20:56:31 -07001162 if (removed)
Yehuda Sadeh37151662010-06-17 16:16:12 -07001163 ceph_put_cap(mdsc, cap);
Sage Weila8599bd2009-10-06 11:31:12 -07001164
Xiubo Libd84fbc2019-12-03 03:00:51 -05001165 if (!__ceph_is_any_real_caps(ci)) {
1166 /* when reconnect denied, we remove session caps forcibly,
1167 * i_wr_ref can be non-zero. If there are ongoing write,
1168 * keep i_snap_realm.
1169 */
1170 if (ci->i_wr_ref == 0 && ci->i_snap_realm)
1171 drop_inode_snap_realm(ci);
Yan, Zhengdb40cc12015-03-23 20:12:20 +08001172
Sage Weila8599bd2009-10-06 11:31:12 -07001173 __cap_delay_cancel(mdsc, ci);
Xiubo Libd84fbc2019-12-03 03:00:51 -05001174 }
Sage Weila8599bd2009-10-06 11:31:12 -07001175}
1176
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001177struct cap_msg_args {
1178 struct ceph_mds_session *session;
1179 u64 ino, cid, follows;
1180 u64 flush_tid, oldest_flush_tid, size, max_size;
1181 u64 xattr_version;
Jeff Layton176c77c2019-06-06 08:06:40 -04001182 u64 change_attr;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001183 struct ceph_buffer *xattr_buf;
Jeff Layton0a454bd2020-03-17 08:47:31 -04001184 struct ceph_buffer *old_xattr_buf;
Jeff Laytonec62b892019-05-29 12:23:14 -04001185 struct timespec64 atime, mtime, ctime, btime;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001186 int op, caps, wanted, dirty;
1187 u32 seq, issue_seq, mseq, time_warp_seq;
Jeff Layton1e4ef0c2016-11-10 07:42:06 -05001188 u32 flags;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001189 kuid_t uid;
1190 kgid_t gid;
1191 umode_t mode;
1192 bool inline_data;
Jeff Layton0a454bd2020-03-17 08:47:31 -04001193 bool wake;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001194};
1195
Sage Weila8599bd2009-10-06 11:31:12 -07001196/*
1197 * Build and send a cap message to the given MDS.
1198 *
1199 * Caller should be holding s_mutex.
1200 */
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001201static int send_cap_msg(struct cap_msg_args *arg)
Sage Weila8599bd2009-10-06 11:31:12 -07001202{
1203 struct ceph_mds_caps *fc;
1204 struct ceph_msg *msg;
Yan, Zhenge20d2582014-11-14 22:39:13 +08001205 void *p;
1206 size_t extra_len;
Jeff Layton92475f02017-04-13 11:07:04 -04001207 struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc;
Sage Weila8599bd2009-10-06 11:31:12 -07001208
1209 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
Yan, Zhenga2971c82015-06-10 11:09:32 +08001210 " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu"
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001211 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op),
1212 arg->cid, arg->ino, ceph_cap_string(arg->caps),
1213 ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty),
1214 arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid,
1215 arg->mseq, arg->follows, arg->size, arg->max_size,
1216 arg->xattr_version,
1217 arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0);
Sage Weila8599bd2009-10-06 11:31:12 -07001218
Yan, Zhenga2971c82015-06-10 11:09:32 +08001219 /* flock buffer size + inline version + inline data size +
1220 * osd_epoch_barrier + oldest_flush_tid */
Jeff Layton43b29672016-11-10 07:42:05 -05001221 extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;
Yan, Zhenge20d2582014-11-14 22:39:13 +08001222 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len,
1223 GFP_NOFS, false);
Sage Weila79832f2010-04-01 16:06:19 -07001224 if (!msg)
1225 return -ENOMEM;
Sage Weila8599bd2009-10-06 11:31:12 -07001226
Jeff Layton43b29672016-11-10 07:42:05 -05001227 msg->hdr.version = cpu_to_le16(10);
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001228 msg->hdr.tid = cpu_to_le64(arg->flush_tid);
Sage Weila8599bd2009-10-06 11:31:12 -07001229
Sage Weil6df058c2009-12-22 11:24:33 -08001230 fc = msg->front.iov_base;
Sage Weila8599bd2009-10-06 11:31:12 -07001231 memset(fc, 0, sizeof(*fc));
1232
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001233 fc->cap_id = cpu_to_le64(arg->cid);
1234 fc->op = cpu_to_le32(arg->op);
1235 fc->seq = cpu_to_le32(arg->seq);
1236 fc->issue_seq = cpu_to_le32(arg->issue_seq);
1237 fc->migrate_seq = cpu_to_le32(arg->mseq);
1238 fc->caps = cpu_to_le32(arg->caps);
1239 fc->wanted = cpu_to_le32(arg->wanted);
1240 fc->dirty = cpu_to_le32(arg->dirty);
1241 fc->ino = cpu_to_le64(arg->ino);
1242 fc->snap_follows = cpu_to_le64(arg->follows);
Sage Weila8599bd2009-10-06 11:31:12 -07001243
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001244 fc->size = cpu_to_le64(arg->size);
1245 fc->max_size = cpu_to_le64(arg->max_size);
Arnd Bergmann9bbeab42018-07-13 22:18:36 +02001246 ceph_encode_timespec64(&fc->mtime, &arg->mtime);
1247 ceph_encode_timespec64(&fc->atime, &arg->atime);
1248 ceph_encode_timespec64(&fc->ctime, &arg->ctime);
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001249 fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq);
Sage Weila8599bd2009-10-06 11:31:12 -07001250
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001251 fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid));
1252 fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid));
1253 fc->mode = cpu_to_le32(arg->mode);
Sage Weila8599bd2009-10-06 11:31:12 -07001254
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001255 fc->xattr_version = cpu_to_le64(arg->xattr_version);
1256 if (arg->xattr_buf) {
1257 msg->middle = ceph_buffer_get(arg->xattr_buf);
1258 fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
1259 msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len);
Jeff Layton96700792016-11-10 07:42:02 -05001260 }
1261
Yan, Zhenge20d2582014-11-14 22:39:13 +08001262 p = fc + 1;
Jeff Layton43b29672016-11-10 07:42:05 -05001263 /* flock buffer size (version 2) */
Yan, Zhenge20d2582014-11-14 22:39:13 +08001264 ceph_encode_32(&p, 0);
Jeff Layton43b29672016-11-10 07:42:05 -05001265 /* inline version (version 4) */
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001266 ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE);
Yan, Zhenge20d2582014-11-14 22:39:13 +08001267 /* inline data size */
1268 ceph_encode_32(&p, 0);
Jeff Layton92475f02017-04-13 11:07:04 -04001269 /*
1270 * osd_epoch_barrier (version 5)
1271 * The epoch_barrier is protected osdc->lock, so READ_ONCE here in
1272 * case it was recently changed
1273 */
1274 ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier));
Jeff Layton43b29672016-11-10 07:42:05 -05001275 /* oldest_flush_tid (version 6) */
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001276 ceph_encode_64(&p, arg->oldest_flush_tid);
Yan, Zhenge20d2582014-11-14 22:39:13 +08001277
Jeff Layton43b29672016-11-10 07:42:05 -05001278 /*
1279 * caller_uid/caller_gid (version 7)
1280 *
1281 * Currently, we don't properly track which caller dirtied the caps
1282 * last, and force a flush of them when there is a conflict. For now,
1283 * just set this to 0:0, to emulate how the MDS has worked up to now.
1284 */
1285 ceph_encode_32(&p, 0);
1286 ceph_encode_32(&p, 0);
1287
1288 /* pool namespace (version 8) (mds always ignores this) */
1289 ceph_encode_32(&p, 0);
1290
Jeff Layton176c77c2019-06-06 08:06:40 -04001291 /* btime and change_attr (version 9) */
Jeff Laytonec62b892019-05-29 12:23:14 -04001292 ceph_encode_timespec64(p, &arg->btime);
Jeff Layton43b29672016-11-10 07:42:05 -05001293 p += sizeof(struct ceph_timespec);
Jeff Layton176c77c2019-06-06 08:06:40 -04001294 ceph_encode_64(&p, arg->change_attr);
Jeff Layton43b29672016-11-10 07:42:05 -05001295
1296 /* Advisory flags (version 10) */
Jeff Layton1e4ef0c2016-11-10 07:42:06 -05001297 ceph_encode_32(&p, arg->flags);
Jeff Layton43b29672016-11-10 07:42:05 -05001298
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001299 ceph_con_send(&arg->session->s_con, msg);
Sage Weila8599bd2009-10-06 11:31:12 -07001300 return 0;
1301}
1302
1303/*
Yan, Zhengd6e47812019-05-23 11:01:37 +08001304 * Queue cap releases when an inode is dropped from our cache.
Sage Weila8599bd2009-10-06 11:31:12 -07001305 */
Yan, Zhengd6e47812019-05-23 11:01:37 +08001306void __ceph_remove_caps(struct ceph_inode_info *ci)
Sage Weila8599bd2009-10-06 11:31:12 -07001307{
Sage Weila8599bd2009-10-06 11:31:12 -07001308 struct rb_node *p;
1309
Yan, Zhengd6e47812019-05-23 11:01:37 +08001310 /* lock i_ceph_lock, because ceph_d_revalidate(..., LOOKUP_RCU)
1311 * may call __ceph_caps_issued_mask() on a freeing inode. */
1312 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001313 p = rb_first(&ci->i_caps);
1314 while (p) {
1315 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
Sage Weila8599bd2009-10-06 11:31:12 -07001316 p = rb_next(p);
Yan, Zhenga096b092013-09-22 10:15:58 +08001317 __ceph_remove_cap(cap, true);
Sage Weila8599bd2009-10-06 11:31:12 -07001318 }
Yan, Zhengd6e47812019-05-23 11:01:37 +08001319 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001320}
1321
1322/*
Jeff Layton0a454bd2020-03-17 08:47:31 -04001323 * Prepare to send a cap message to an MDS. Update the cap state, and populate
1324 * the arg struct with the parameters that will need to be sent. This should
1325 * be done under the i_ceph_lock to guard against changes to cap state.
Sage Weila8599bd2009-10-06 11:31:12 -07001326 *
1327 * Make note of max_size reported/requested from mds, revoked caps
1328 * that have now been implemented.
Sage Weila8599bd2009-10-06 11:31:12 -07001329 */
Jeff Layton0a454bd2020-03-17 08:47:31 -04001330static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
1331 int op, int flags, int used, int want, int retain,
1332 int flushing, u64 flush_tid, u64 oldest_flush_tid)
Sage Weila8599bd2009-10-06 11:31:12 -07001333{
1334 struct ceph_inode_info *ci = cap->ci;
1335 struct inode *inode = &ci->vfs_inode;
Colin Ian Kingbb0581f2017-10-18 12:34:25 +01001336 int held, revoking;
Sage Weila8599bd2009-10-06 11:31:12 -07001337
Jeff Layton0a454bd2020-03-17 08:47:31 -04001338 lockdep_assert_held(&ci->i_ceph_lock);
Jeff Layton891f3f52020-01-14 15:06:40 -05001339
Sage Weil68c28322010-02-09 13:41:47 -08001340 held = cap->issued | cap->implemented;
1341 revoking = cap->implemented & ~cap->issued;
1342 retain &= ~revoking;
Sage Weil68c28322010-02-09 13:41:47 -08001343
Jeff Layton0a454bd2020-03-17 08:47:31 -04001344 dout("%s %p cap %p session %p %s -> %s (revoking %s)\n",
1345 __func__, inode, cap, cap->session,
Sage Weila8599bd2009-10-06 11:31:12 -07001346 ceph_cap_string(held), ceph_cap_string(held & retain),
1347 ceph_cap_string(revoking));
1348 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1349
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001350 ci->i_ceph_flags &= ~CEPH_I_FLUSH;
Sage Weila8599bd2009-10-06 11:31:12 -07001351
1352 cap->issued &= retain; /* drop bits we don't want */
Jeff Layton0a454bd2020-03-17 08:47:31 -04001353 /*
1354 * Wake up any waiters on wanted -> needed transition. This is due to
1355 * the weird transition from buffered to sync IO... we need to flush
1356 * dirty pages _before_ allowing sync writes to avoid reordering.
1357 */
1358 arg->wake = cap->implemented & ~cap->issued;
Sage Weila8599bd2009-10-06 11:31:12 -07001359 cap->implemented &= cap->issued | used;
1360 cap->mds_wanted = want;
1361
Jeff Layton0a454bd2020-03-17 08:47:31 -04001362 arg->session = cap->session;
1363 arg->ino = ceph_vino(inode).ino;
1364 arg->cid = cap->cap_id;
1365 arg->follows = flushing ? ci->i_head_snapc->seq : 0;
1366 arg->flush_tid = flush_tid;
1367 arg->oldest_flush_tid = oldest_flush_tid;
Sage Weila8599bd2009-10-06 11:31:12 -07001368
Jeff Layton0a454bd2020-03-17 08:47:31 -04001369 arg->size = inode->i_size;
1370 ci->i_reported_size = arg->size;
1371 arg->max_size = ci->i_wanted_max_size;
Yan, Zheng11ba6b92020-03-05 20:21:03 +08001372 if (cap == ci->i_auth_cap)
Jeff Layton0a454bd2020-03-17 08:47:31 -04001373 ci->i_requested_max_size = arg->max_size;
Sage Weila8599bd2009-10-06 11:31:12 -07001374
Sage Weil082afec2010-08-22 15:16:41 -07001375 if (flushing & CEPH_CAP_XATTR_EXCL) {
Jeff Layton0a454bd2020-03-17 08:47:31 -04001376 arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
1377 arg->xattr_version = ci->i_xattrs.version;
1378 arg->xattr_buf = ci->i_xattrs.blob;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001379 } else {
Jeff Layton0a454bd2020-03-17 08:47:31 -04001380 arg->xattr_buf = NULL;
1381 arg->old_xattr_buf = NULL;
Sage Weila8599bd2009-10-06 11:31:12 -07001382 }
1383
Jeff Layton0a454bd2020-03-17 08:47:31 -04001384 arg->mtime = inode->i_mtime;
1385 arg->atime = inode->i_atime;
1386 arg->ctime = inode->i_ctime;
1387 arg->btime = ci->i_btime;
1388 arg->change_attr = inode_peek_iversion_raw(inode);
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001389
Jeff Layton0a454bd2020-03-17 08:47:31 -04001390 arg->op = op;
1391 arg->caps = cap->implemented;
1392 arg->wanted = want;
1393 arg->dirty = flushing;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001394
Jeff Layton0a454bd2020-03-17 08:47:31 -04001395 arg->seq = cap->seq;
1396 arg->issue_seq = cap->issue_seq;
1397 arg->mseq = cap->mseq;
1398 arg->time_warp_seq = ci->i_time_warp_seq;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001399
Jeff Layton0a454bd2020-03-17 08:47:31 -04001400 arg->uid = inode->i_uid;
1401 arg->gid = inode->i_gid;
1402 arg->mode = inode->i_mode;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001403
Jeff Layton0a454bd2020-03-17 08:47:31 -04001404 arg->inline_data = ci->i_inline_version != CEPH_INLINE_NONE;
Yan, Zheng49ada6e2019-06-20 12:09:08 +08001405 if (!(flags & CEPH_CLIENT_CAPS_PENDING_CAPSNAP) &&
1406 !list_empty(&ci->i_cap_snaps)) {
1407 struct ceph_cap_snap *capsnap;
1408 list_for_each_entry_reverse(capsnap, &ci->i_cap_snaps, ci_item) {
1409 if (capsnap->cap_flush.tid)
1410 break;
1411 if (capsnap->need_flush) {
1412 flags |= CEPH_CLIENT_CAPS_PENDING_CAPSNAP;
1413 break;
1414 }
1415 }
1416 }
Jeff Layton0a454bd2020-03-17 08:47:31 -04001417 arg->flags = flags;
1418}
Yan, Zhenge20d2582014-11-14 22:39:13 +08001419
Jeff Layton0a454bd2020-03-17 08:47:31 -04001420/*
1421 * Send a cap msg on the given inode.
1422 *
1423 * Caller should hold snap_rwsem (read), s_mutex.
1424 */
1425static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg,
1426 struct ceph_inode_info *ci)
1427{
1428 struct inode *inode = &ci->vfs_inode;
1429 int ret;
Sage Weila8599bd2009-10-06 11:31:12 -07001430
Jeff Layton0a454bd2020-03-17 08:47:31 -04001431 ret = send_cap_msg(arg);
Sage Weila8599bd2009-10-06 11:31:12 -07001432 if (ret < 0) {
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001433 pr_err("error sending cap msg, ino (%llx.%llx) "
1434 "flushing %s tid %llu, requeue\n",
Jeff Layton0a454bd2020-03-17 08:47:31 -04001435 ceph_vinop(inode), ceph_cap_string(arg->dirty),
1436 arg->flush_tid);
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001437 spin_lock(&ci->i_ceph_lock);
1438 __cap_delay_requeue(mdsc, ci);
1439 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001440 }
1441
Jeff Layton0a454bd2020-03-17 08:47:31 -04001442 ceph_buffer_put(arg->old_xattr_buf);
Sage Weila8599bd2009-10-06 11:31:12 -07001443
Jeff Layton0a454bd2020-03-17 08:47:31 -04001444 if (arg->wake)
1445 wake_up_all(&ci->i_cap_wq);
Sage Weila8599bd2009-10-06 11:31:12 -07001446}
1447
Yan, Zheng0e294382016-07-04 18:06:41 +08001448static inline int __send_flush_snap(struct inode *inode,
1449 struct ceph_mds_session *session,
1450 struct ceph_cap_snap *capsnap,
1451 u32 mseq, u64 oldest_flush_tid)
1452{
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001453 struct cap_msg_args arg;
1454
1455 arg.session = session;
1456 arg.ino = ceph_vino(inode).ino;
1457 arg.cid = 0;
1458 arg.follows = capsnap->follows;
1459 arg.flush_tid = capsnap->cap_flush.tid;
1460 arg.oldest_flush_tid = oldest_flush_tid;
1461
1462 arg.size = capsnap->size;
1463 arg.max_size = 0;
1464 arg.xattr_version = capsnap->xattr_version;
1465 arg.xattr_buf = capsnap->xattr_blob;
Jeff Layton0a454bd2020-03-17 08:47:31 -04001466 arg.old_xattr_buf = NULL;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001467
1468 arg.atime = capsnap->atime;
1469 arg.mtime = capsnap->mtime;
1470 arg.ctime = capsnap->ctime;
Jeff Laytonec62b892019-05-29 12:23:14 -04001471 arg.btime = capsnap->btime;
Jeff Layton176c77c2019-06-06 08:06:40 -04001472 arg.change_attr = capsnap->change_attr;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001473
1474 arg.op = CEPH_CAP_OP_FLUSHSNAP;
1475 arg.caps = capsnap->issued;
1476 arg.wanted = 0;
1477 arg.dirty = capsnap->dirty;
1478
1479 arg.seq = 0;
1480 arg.issue_seq = 0;
1481 arg.mseq = mseq;
1482 arg.time_warp_seq = capsnap->time_warp_seq;
1483
1484 arg.uid = capsnap->uid;
1485 arg.gid = capsnap->gid;
1486 arg.mode = capsnap->mode;
1487
1488 arg.inline_data = capsnap->inline_data;
Jeff Layton1e4ef0c2016-11-10 07:42:06 -05001489 arg.flags = 0;
Jeff Layton0a454bd2020-03-17 08:47:31 -04001490 arg.wake = false;
Jeff Layton0ff8bfb2016-11-10 07:42:03 -05001491
1492 return send_cap_msg(&arg);
Yan, Zheng0e294382016-07-04 18:06:41 +08001493}
1494
Sage Weila8599bd2009-10-06 11:31:12 -07001495/*
1496 * When a snapshot is taken, clients accumulate dirty metadata on
1497 * inodes with capabilities in ceph_cap_snaps to describe the file
1498 * state at the time the snapshot was taken. This must be flushed
1499 * asynchronously back to the MDS once sync writes complete and dirty
1500 * data is written out.
1501 *
Sage Weilbe655592011-11-30 09:47:09 -08001502 * Called under i_ceph_lock. Takes s_mutex as needed.
Sage Weila8599bd2009-10-06 11:31:12 -07001503 */
Yan, Zhenged9b4302016-07-05 21:08:07 +08001504static void __ceph_flush_snaps(struct ceph_inode_info *ci,
1505 struct ceph_mds_session *session)
Sage Weilbe655592011-11-30 09:47:09 -08001506 __releases(ci->i_ceph_lock)
1507 __acquires(ci->i_ceph_lock)
Sage Weila8599bd2009-10-06 11:31:12 -07001508{
1509 struct inode *inode = &ci->vfs_inode;
Yan, Zhenged9b4302016-07-05 21:08:07 +08001510 struct ceph_mds_client *mdsc = session->s_mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07001511 struct ceph_cap_snap *capsnap;
Yan, Zhenged9b4302016-07-05 21:08:07 +08001512 u64 oldest_flush_tid = 0;
1513 u64 first_tid = 1, last_tid = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07001514
Yan, Zhenged9b4302016-07-05 21:08:07 +08001515 dout("__flush_snaps %p session %p\n", inode, session);
Sage Weila8599bd2009-10-06 11:31:12 -07001516
Sage Weila8599bd2009-10-06 11:31:12 -07001517 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
Sage Weila8599bd2009-10-06 11:31:12 -07001518 /*
1519 * we need to wait for sync writes to complete and for dirty
1520 * pages to be written out.
1521 */
1522 if (capsnap->dirty_pages || capsnap->writing)
Sage Weilcfc0bf62010-09-14 15:50:59 -07001523 break;
Sage Weila8599bd2009-10-06 11:31:12 -07001524
Yan, Zheng86056092015-05-01 16:57:16 +08001525 /* should be removed by ceph_try_drop_cap_snap() */
1526 BUG_ON(!capsnap->need_flush);
Sage Weil819ccbf2010-04-01 09:33:46 -07001527
Sage Weile8351242010-09-17 08:03:08 -07001528 /* only flush each capsnap once */
Yan, Zheng0e294382016-07-04 18:06:41 +08001529 if (capsnap->cap_flush.tid > 0) {
Yan, Zhenged9b4302016-07-05 21:08:07 +08001530 dout(" already flushed %p, skipping\n", capsnap);
Sage Weile8351242010-09-17 08:03:08 -07001531 continue;
1532 }
1533
Yan, Zheng553adfd2015-06-09 15:48:57 +08001534 spin_lock(&mdsc->cap_dirty_lock);
Yan, Zheng0e294382016-07-04 18:06:41 +08001535 capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
1536 list_add_tail(&capsnap->cap_flush.g_list,
1537 &mdsc->cap_flush_list);
Yan, Zhenged9b4302016-07-05 21:08:07 +08001538 if (oldest_flush_tid == 0)
1539 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
Yan, Zheng0e294382016-07-04 18:06:41 +08001540 if (list_empty(&ci->i_flushing_item)) {
1541 list_add_tail(&ci->i_flushing_item,
1542 &session->s_cap_flushing);
1543 }
Yan, Zheng553adfd2015-06-09 15:48:57 +08001544 spin_unlock(&mdsc->cap_dirty_lock);
1545
Yan, Zheng0e294382016-07-04 18:06:41 +08001546 list_add_tail(&capsnap->cap_flush.i_list,
1547 &ci->i_cap_flush_list);
1548
Yan, Zhenged9b4302016-07-05 21:08:07 +08001549 if (first_tid == 1)
1550 first_tid = capsnap->cap_flush.tid;
1551 last_tid = capsnap->cap_flush.tid;
1552 }
1553
1554 ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
1555
1556 while (first_tid <= last_tid) {
1557 struct ceph_cap *cap = ci->i_auth_cap;
1558 struct ceph_cap_flush *cf;
1559 int ret;
1560
1561 if (!(cap && cap->session == session)) {
1562 dout("__flush_snaps %p auth cap %p not mds%d, "
1563 "stop\n", inode, cap, session->s_mds);
1564 break;
1565 }
1566
1567 ret = -ENOENT;
1568 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
1569 if (cf->tid >= first_tid) {
1570 ret = 0;
1571 break;
1572 }
1573 }
1574 if (ret < 0)
1575 break;
1576
1577 first_tid = cf->tid + 1;
1578
1579 capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
Elena Reshetova805692d2017-03-03 11:15:07 +02001580 refcount_inc(&capsnap->nref);
Sage Weilbe655592011-11-30 09:47:09 -08001581 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001582
Yan, Zhenged9b4302016-07-05 21:08:07 +08001583 dout("__flush_snaps %p capsnap %p tid %llu %s\n",
1584 inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
Sage Weila8599bd2009-10-06 11:31:12 -07001585
Yan, Zhenged9b4302016-07-05 21:08:07 +08001586 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
1587 oldest_flush_tid);
1588 if (ret < 0) {
1589 pr_err("__flush_snaps: error sending cap flushsnap, "
1590 "ino (%llx.%llx) tid %llu follows %llu\n",
1591 ceph_vinop(inode), cf->tid, capsnap->follows);
1592 }
1593
Sage Weila8599bd2009-10-06 11:31:12 -07001594 ceph_put_cap_snap(capsnap);
Sage Weilbe655592011-11-30 09:47:09 -08001595 spin_lock(&ci->i_ceph_lock);
Yan, Zhenged9b4302016-07-05 21:08:07 +08001596 }
1597}
1598
1599void ceph_flush_snaps(struct ceph_inode_info *ci,
1600 struct ceph_mds_session **psession)
1601{
1602 struct inode *inode = &ci->vfs_inode;
1603 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
Yan, Zhenge4d2b162016-08-04 08:43:33 +08001604 struct ceph_mds_session *session = NULL;
Yan, Zhenged9b4302016-07-05 21:08:07 +08001605 int mds;
Yan, Zhenge4d2b162016-08-04 08:43:33 +08001606
Yan, Zhenged9b4302016-07-05 21:08:07 +08001607 dout("ceph_flush_snaps %p\n", inode);
Yan, Zhenge4d2b162016-08-04 08:43:33 +08001608 if (psession)
1609 session = *psession;
Yan, Zhenged9b4302016-07-05 21:08:07 +08001610retry:
1611 spin_lock(&ci->i_ceph_lock);
1612 if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
1613 dout(" no capsnap needs flush, doing nothing\n");
1614 goto out;
1615 }
1616 if (!ci->i_auth_cap) {
1617 dout(" no auth cap (migrating?), doing nothing\n");
1618 goto out;
1619 }
1620
1621 mds = ci->i_auth_cap->session->s_mds;
1622 if (session && session->s_mds != mds) {
1623 dout(" oops, wrong session %p mutex\n", session);
1624 mutex_unlock(&session->s_mutex);
1625 ceph_put_mds_session(session);
1626 session = NULL;
1627 }
1628 if (!session) {
1629 spin_unlock(&ci->i_ceph_lock);
1630 mutex_lock(&mdsc->mutex);
1631 session = __ceph_lookup_mds_session(mdsc, mds);
1632 mutex_unlock(&mdsc->mutex);
1633 if (session) {
1634 dout(" inverting session/ino locks on %p\n", session);
1635 mutex_lock(&session->s_mutex);
1636 }
Sage Weila8599bd2009-10-06 11:31:12 -07001637 goto retry;
1638 }
1639
Yan, Zheng24d063a2017-08-15 11:37:32 +08001640 // make sure flushsnap messages are sent in proper order.
Yan, Zheng054f8d42019-06-20 16:00:31 +08001641 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
Yan, Zheng24d063a2017-08-15 11:37:32 +08001642 __kick_flushing_caps(mdsc, session, ci, 0);
Yan, Zheng24d063a2017-08-15 11:37:32 +08001643
Yan, Zhenged9b4302016-07-05 21:08:07 +08001644 __ceph_flush_snaps(ci, session);
1645out:
1646 spin_unlock(&ci->i_ceph_lock);
1647
1648 if (psession) {
1649 *psession = session;
Yan, Zhengc858a072017-08-28 15:02:42 +08001650 } else if (session) {
Yan, Zhenged9b4302016-07-05 21:08:07 +08001651 mutex_unlock(&session->s_mutex);
1652 ceph_put_mds_session(session);
1653 }
Sage Weila8599bd2009-10-06 11:31:12 -07001654 /* we flushed them all; remove this inode from the queue */
1655 spin_lock(&mdsc->snap_flush_lock);
1656 list_del_init(&ci->i_snap_flush_item);
1657 spin_unlock(&mdsc->snap_flush_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001658}
1659
1660/*
Sage Weilfca65b42011-05-04 11:33:47 -07001661 * Mark caps dirty. If inode is newly dirty, return the dirty flags.
1662 * Caller is then responsible for calling __mark_inode_dirty with the
1663 * returned flags value.
Sage Weil76e3b392009-10-15 18:13:53 -07001664 */
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001665int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
1666 struct ceph_cap_flush **pcf)
Sage Weil76e3b392009-10-15 18:13:53 -07001667{
Cheng Renquan640ef792010-03-26 17:40:33 +08001668 struct ceph_mds_client *mdsc =
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07001669 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
Sage Weil76e3b392009-10-15 18:13:53 -07001670 struct inode *inode = &ci->vfs_inode;
1671 int was = ci->i_dirty_caps;
1672 int dirty = 0;
1673
Jeff Laytonc7e4f852020-02-25 11:08:33 -08001674 lockdep_assert_held(&ci->i_ceph_lock);
1675
Yan, Zheng571ade32015-03-24 11:36:08 +08001676 if (!ci->i_auth_cap) {
1677 pr_warn("__mark_dirty_caps %p %llx mask %s, "
1678 "but no auth cap (session was closed?)\n",
1679 inode, ceph_ino(inode), ceph_cap_string(mask));
1680 return 0;
1681 }
1682
Sage Weil76e3b392009-10-15 18:13:53 -07001683 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1684 ceph_cap_string(mask), ceph_cap_string(was),
1685 ceph_cap_string(was | mask));
1686 ci->i_dirty_caps |= mask;
1687 if (was == 0) {
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001688 WARN_ON_ONCE(ci->i_prealloc_cap_flush);
1689 swap(ci->i_prealloc_cap_flush, *pcf);
1690
Yan, Zheng604d1b02015-05-01 17:49:16 +08001691 if (!ci->i_head_snapc) {
1692 WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem));
Sage Weil7d8cb262010-08-24 08:44:16 -07001693 ci->i_head_snapc = ceph_get_snap_context(
1694 ci->i_snap_realm->cached_context);
Yan, Zheng604d1b02015-05-01 17:49:16 +08001695 }
Yan, Zheng06852352012-11-19 10:49:07 +08001696 dout(" inode %p now dirty snapc %p auth cap %p\n",
1697 &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap);
Sage Weil76e3b392009-10-15 18:13:53 -07001698 BUG_ON(!list_empty(&ci->i_dirty_item));
1699 spin_lock(&mdsc->cap_dirty_lock);
Yan, Zheng11df2df2013-11-24 14:44:38 +08001700 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
Sage Weil76e3b392009-10-15 18:13:53 -07001701 spin_unlock(&mdsc->cap_dirty_lock);
1702 if (ci->i_flushing_caps == 0) {
Sage Weil3772d262011-05-03 09:28:08 -07001703 ihold(inode);
Sage Weil76e3b392009-10-15 18:13:53 -07001704 dirty |= I_DIRTY_SYNC;
1705 }
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001706 } else {
1707 WARN_ON_ONCE(!ci->i_prealloc_cap_flush);
Sage Weil76e3b392009-10-15 18:13:53 -07001708 }
1709 BUG_ON(list_empty(&ci->i_dirty_item));
1710 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1711 (mask & CEPH_CAP_FILE_BUFFER))
1712 dirty |= I_DIRTY_DATASYNC;
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001713 __cap_delay_requeue(mdsc, ci);
Sage Weilfca65b42011-05-04 11:33:47 -07001714 return dirty;
Sage Weil76e3b392009-10-15 18:13:53 -07001715}
1716
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001717struct ceph_cap_flush *ceph_alloc_cap_flush(void)
1718{
1719 return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
1720}
1721
1722void ceph_free_cap_flush(struct ceph_cap_flush *cf)
1723{
1724 if (cf)
1725 kmem_cache_free(ceph_cap_flush_cachep, cf);
1726}
1727
Yan, Zhenga2971c82015-06-10 11:09:32 +08001728static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
1729{
Yan, Zhenge4500b52016-07-06 11:12:56 +08001730 if (!list_empty(&mdsc->cap_flush_list)) {
Yan, Zhenga2971c82015-06-10 11:09:32 +08001731 struct ceph_cap_flush *cf =
Yan, Zhenge4500b52016-07-06 11:12:56 +08001732 list_first_entry(&mdsc->cap_flush_list,
1733 struct ceph_cap_flush, g_list);
Yan, Zhenga2971c82015-06-10 11:09:32 +08001734 return cf->tid;
1735 }
1736 return 0;
1737}
1738
Sage Weil76e3b392009-10-15 18:13:53 -07001739/*
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001740 * Remove cap_flush from the mdsc's or inode's flushing cap list.
1741 * Return true if caller needs to wake up flush waiters.
1742 */
Jeff Layton681ac632020-03-18 15:29:34 -04001743static bool __detach_cap_flush_from_mdsc(struct ceph_mds_client *mdsc,
1744 struct ceph_cap_flush *cf)
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001745{
1746 struct ceph_cap_flush *prev;
1747 bool wake = cf->wake;
Jeff Layton681ac632020-03-18 15:29:34 -04001748
1749 if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
1750 prev = list_prev_entry(cf, g_list);
1751 prev->wake = true;
1752 wake = false;
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001753 }
Jeff Layton681ac632020-03-18 15:29:34 -04001754 list_del(&cf->g_list);
1755 return wake;
1756}
1757
1758static bool __detach_cap_flush_from_ci(struct ceph_inode_info *ci,
1759 struct ceph_cap_flush *cf)
1760{
1761 struct ceph_cap_flush *prev;
1762 bool wake = cf->wake;
1763
1764 if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
1765 prev = list_prev_entry(cf, i_list);
1766 prev->wake = true;
1767 wake = false;
1768 }
1769 list_del(&cf->i_list);
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001770 return wake;
1771}
1772
1773/*
Sage Weila8599bd2009-10-06 11:31:12 -07001774 * Add dirty inode to the flushing list. Assigned a seq number so we
1775 * can wait for caps to flush without starving.
Sage Weilcdc35f92009-10-14 14:24:19 -07001776 *
Jeff Layton9f3345d2019-07-08 12:27:57 -04001777 * Called under i_ceph_lock. Returns the flush tid.
Sage Weila8599bd2009-10-06 11:31:12 -07001778 */
Jeff Layton9f3345d2019-07-08 12:27:57 -04001779static u64 __mark_caps_flushing(struct inode *inode,
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001780 struct ceph_mds_session *session, bool wake,
Jeff Layton9f3345d2019-07-08 12:27:57 -04001781 u64 *oldest_flush_tid)
Sage Weila8599bd2009-10-06 11:31:12 -07001782{
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07001783 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07001784 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001785 struct ceph_cap_flush *cf = NULL;
Sage Weilcdc35f92009-10-14 14:24:19 -07001786 int flushing;
Sage Weil50b885b2009-12-01 14:12:07 -08001787
Jeff Laytonc7e4f852020-02-25 11:08:33 -08001788 lockdep_assert_held(&ci->i_ceph_lock);
Sage Weilcdc35f92009-10-14 14:24:19 -07001789 BUG_ON(ci->i_dirty_caps == 0);
Sage Weila8599bd2009-10-06 11:31:12 -07001790 BUG_ON(list_empty(&ci->i_dirty_item));
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001791 BUG_ON(!ci->i_prealloc_cap_flush);
Sage Weilcdc35f92009-10-14 14:24:19 -07001792
1793 flushing = ci->i_dirty_caps;
1794 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1795 ceph_cap_string(flushing),
1796 ceph_cap_string(ci->i_flushing_caps),
1797 ceph_cap_string(ci->i_flushing_caps | flushing));
1798 ci->i_flushing_caps |= flushing;
1799 ci->i_dirty_caps = 0;
Sage Weilafcdaea2009-10-14 14:27:38 -07001800 dout(" inode %p now !dirty\n", inode);
Sage Weilcdc35f92009-10-14 14:24:19 -07001801
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001802 swap(cf, ci->i_prealloc_cap_flush);
Yan, Zheng553adfd2015-06-09 15:48:57 +08001803 cf->caps = flushing;
Yan, Zhengc8799fc2016-07-07 15:22:38 +08001804 cf->wake = wake;
Yan, Zheng553adfd2015-06-09 15:48:57 +08001805
Sage Weila8599bd2009-10-06 11:31:12 -07001806 spin_lock(&mdsc->cap_dirty_lock);
Sage Weilafcdaea2009-10-14 14:27:38 -07001807 list_del_init(&ci->i_dirty_item);
1808
Yan, Zheng553adfd2015-06-09 15:48:57 +08001809 cf->tid = ++mdsc->last_cap_flush_tid;
Yan, Zhenge4500b52016-07-06 11:12:56 +08001810 list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
Yan, Zhenga2971c82015-06-10 11:09:32 +08001811 *oldest_flush_tid = __get_oldest_flush_tid(mdsc);
Yan, Zheng553adfd2015-06-09 15:48:57 +08001812
Sage Weila8599bd2009-10-06 11:31:12 -07001813 if (list_empty(&ci->i_flushing_item)) {
1814 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1815 mdsc->num_cap_flushing++;
Sage Weila8599bd2009-10-06 11:31:12 -07001816 }
1817 spin_unlock(&mdsc->cap_dirty_lock);
Sage Weilcdc35f92009-10-14 14:24:19 -07001818
Yan, Zhenge4500b52016-07-06 11:12:56 +08001819 list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
Yan, Zheng553adfd2015-06-09 15:48:57 +08001820
Jeff Layton9f3345d2019-07-08 12:27:57 -04001821 return cf->tid;
Sage Weila8599bd2009-10-06 11:31:12 -07001822}
1823
1824/*
Sage Weil5ecad6f2010-02-17 10:43:37 -08001825 * try to invalidate mapping pages without blocking.
1826 */
Sage Weil5ecad6f2010-02-17 10:43:37 -08001827static int try_nonblocking_invalidate(struct inode *inode)
1828{
1829 struct ceph_inode_info *ci = ceph_inode(inode);
1830 u32 invalidating_gen = ci->i_rdcache_gen;
1831
Sage Weilbe655592011-11-30 09:47:09 -08001832 spin_unlock(&ci->i_ceph_lock);
Sage Weil5ecad6f2010-02-17 10:43:37 -08001833 invalidate_mapping_pages(&inode->i_data, 0, -1);
Sage Weilbe655592011-11-30 09:47:09 -08001834 spin_lock(&ci->i_ceph_lock);
Sage Weil5ecad6f2010-02-17 10:43:37 -08001835
Sage Weil18a38192010-09-17 10:46:44 -07001836 if (inode->i_data.nrpages == 0 &&
Sage Weil5ecad6f2010-02-17 10:43:37 -08001837 invalidating_gen == ci->i_rdcache_gen) {
1838 /* success. */
1839 dout("try_nonblocking_invalidate %p success\n", inode);
Sage Weilcd045cb2010-11-04 11:05:05 -07001840 /* save any racing async invalidate some trouble */
1841 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
Sage Weil5ecad6f2010-02-17 10:43:37 -08001842 return 0;
1843 }
1844 dout("try_nonblocking_invalidate %p failed\n", inode);
1845 return -1;
1846}
1847
Yan, Zhengefb0ca72017-05-22 12:03:32 +08001848bool __ceph_should_report_size(struct ceph_inode_info *ci)
1849{
1850 loff_t size = ci->vfs_inode.i_size;
1851 /* mds will adjust max size according to the reported size */
1852 if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
1853 return false;
1854 if (size >= ci->i_max_size)
1855 return true;
1856 /* half of previous max_size increment has been used */
1857 if (ci->i_max_size > ci->i_reported_size &&
1858 (size << 1) >= ci->i_max_size + ci->i_reported_size)
1859 return true;
1860 return false;
1861}
1862
Sage Weil5ecad6f2010-02-17 10:43:37 -08001863/*
Sage Weila8599bd2009-10-06 11:31:12 -07001864 * Swiss army knife function to examine currently used and wanted
1865 * versus held caps. Release, flush, ack revoked caps to mds as
1866 * appropriate.
1867 *
Sage Weila8599bd2009-10-06 11:31:12 -07001868 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1869 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1870 * further delay.
1871 */
1872void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1873 struct ceph_mds_session *session)
1874{
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07001875 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1876 struct ceph_mds_client *mdsc = fsc->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07001877 struct inode *inode = &ci->vfs_inode;
1878 struct ceph_cap *cap;
Yan, Zhenga2971c82015-06-10 11:09:32 +08001879 u64 flush_tid, oldest_flush_tid;
Yan, Zheng395c3122013-01-04 14:37:57 +08001880 int file_wanted, used, cap_used;
Sage Weila8599bd2009-10-06 11:31:12 -07001881 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
Sage Weilcbd03632010-02-09 13:41:18 -08001882 int issued, implemented, want, retain, revoking, flushing = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07001883 int mds = -1; /* keep track of how far we've gone through i_caps list
1884 to avoid an infinite loop on retry */
1885 struct rb_node *p;
Yan, Zheng36094042016-07-06 15:37:42 +08001886 bool queue_invalidate = false;
Yan, Zheng36094042016-07-06 15:37:42 +08001887 bool tried_invalidate = false;
Sage Weila8599bd2009-10-06 11:31:12 -07001888
Sage Weilbe655592011-11-30 09:47:09 -08001889 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001890 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1891 flags |= CHECK_CAPS_FLUSH;
1892
Sage Weila8599bd2009-10-06 11:31:12 -07001893 goto retry_locked;
1894retry:
Sage Weilbe655592011-11-30 09:47:09 -08001895 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07001896retry_locked:
1897 file_wanted = __ceph_caps_file_wanted(ci);
1898 used = __ceph_caps_used(ci);
Sage Weilcbd03632010-02-09 13:41:18 -08001899 issued = __ceph_caps_issued(ci, &implemented);
1900 revoking = implemented & ~issued;
Sage Weila8599bd2009-10-06 11:31:12 -07001901
Yan, Zheng41445992015-05-25 17:36:42 +08001902 want = file_wanted;
1903 retain = file_wanted | used | CEPH_CAP_PIN;
Sage Weila8599bd2009-10-06 11:31:12 -07001904 if (!mdsc->stopping && inode->i_nlink > 0) {
Yan, Zheng41445992015-05-25 17:36:42 +08001905 if (file_wanted) {
Sage Weila8599bd2009-10-06 11:31:12 -07001906 retain |= CEPH_CAP_ANY; /* be greedy */
Yan, Zheng32ec4392015-03-26 19:06:00 +08001907 } else if (S_ISDIR(inode->i_mode) &&
1908 (issued & CEPH_CAP_FILE_SHARED) &&
Yan, Zheng8a2ac3a2018-12-05 11:29:35 +08001909 __ceph_dir_is_complete(ci)) {
Yan, Zheng32ec4392015-03-26 19:06:00 +08001910 /*
1911 * If a directory is complete, we want to keep
1912 * the exclusive cap. So that MDS does not end up
1913 * revoking the shared cap on every create/unlink
1914 * operation.
1915 */
Jeff Laytona25949b2020-02-18 14:12:45 -05001916 if (IS_RDONLY(inode)) {
Yan, Zheng8a2ac3a2018-12-05 11:29:35 +08001917 want = CEPH_CAP_ANY_SHARED;
Jeff Laytona25949b2020-02-18 14:12:45 -05001918 } else {
Yan, Zheng719a2512020-03-05 20:21:00 +08001919 want |= CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
Jeff Laytona25949b2020-02-18 14:12:45 -05001920 }
Yan, Zheng32ec4392015-03-26 19:06:00 +08001921 retain |= want;
Sage Weila8599bd2009-10-06 11:31:12 -07001922 } else {
Yan, Zheng32ec4392015-03-26 19:06:00 +08001923
Sage Weila8599bd2009-10-06 11:31:12 -07001924 retain |= CEPH_CAP_ANY_SHARED;
1925 /*
1926 * keep RD only if we didn't have the file open RW,
1927 * because then the mds would revoke it anyway to
1928 * journal max_size=0.
1929 */
1930 if (ci->i_max_size == 0)
1931 retain |= CEPH_CAP_ANY_RD;
1932 }
1933 }
1934
1935 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001936 " issued %s revoking %s retain %s %s%s\n", inode,
Sage Weila8599bd2009-10-06 11:31:12 -07001937 ceph_cap_string(file_wanted),
1938 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1939 ceph_cap_string(ci->i_flushing_caps),
Sage Weilcbd03632010-02-09 13:41:18 -08001940 ceph_cap_string(issued), ceph_cap_string(revoking),
Sage Weila8599bd2009-10-06 11:31:12 -07001941 ceph_cap_string(retain),
1942 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
Sage Weila8599bd2009-10-06 11:31:12 -07001943 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1944
1945 /*
1946 * If we no longer need to hold onto old our caps, and we may
1947 * have cached pages, but don't want them, then try to invalidate.
1948 * If we fail, it's because pages are locked.... try again later.
1949 */
Yan, Zhenga0d93e32020-03-05 20:21:01 +08001950 if ((!(flags & CHECK_CAPS_NOINVAL) || mdsc->stopping) &&
Yan, Zheng525d15e2019-05-11 17:27:59 +08001951 S_ISREG(inode->i_mode) &&
Yan, Zheng9abd4db2016-05-18 20:58:26 +08001952 !(ci->i_wb_ref || ci->i_wrbuffer_ref) && /* no dirty pages... */
Yan, Zhengfdd4e152015-06-16 20:48:56 +08001953 inode->i_data.nrpages && /* have cached pages */
Yan, Zheng5e804ac2015-10-26 16:08:43 +08001954 (revoking & (CEPH_CAP_FILE_CACHE|
1955 CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */
Sage Weila8599bd2009-10-06 11:31:12 -07001956 !tried_invalidate) {
Sage Weila8599bd2009-10-06 11:31:12 -07001957 dout("check_caps trying to invalidate on %p\n", inode);
Sage Weil5ecad6f2010-02-17 10:43:37 -08001958 if (try_nonblocking_invalidate(inode) < 0) {
Yan, Zhengee612d92018-01-08 14:36:25 +08001959 dout("check_caps queuing invalidate\n");
1960 queue_invalidate = true;
1961 ci->i_rdcache_revoking = ci->i_rdcache_gen;
Sage Weila8599bd2009-10-06 11:31:12 -07001962 }
Yan, Zheng36094042016-07-06 15:37:42 +08001963 tried_invalidate = true;
Sage Weila8599bd2009-10-06 11:31:12 -07001964 goto retry_locked;
1965 }
1966
Sage Weila8599bd2009-10-06 11:31:12 -07001967 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
Jeff Layton0a454bd2020-03-17 08:47:31 -04001968 struct cap_msg_args arg;
1969
Sage Weila8599bd2009-10-06 11:31:12 -07001970 cap = rb_entry(p, struct ceph_cap, ci_node);
Sage Weila8599bd2009-10-06 11:31:12 -07001971
1972 /* avoid looping forever */
1973 if (mds >= cap->mds ||
1974 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1975 continue;
1976
1977 /* NOTE: no side-effects allowed, until we take s_mutex */
1978
Yan, Zheng395c3122013-01-04 14:37:57 +08001979 cap_used = used;
1980 if (ci->i_auth_cap && cap != ci->i_auth_cap)
1981 cap_used &= ~ci->i_auth_cap->issued;
1982
Sage Weila8599bd2009-10-06 11:31:12 -07001983 revoking = cap->implemented & ~cap->issued;
Yan, Zheng395c3122013-01-04 14:37:57 +08001984 dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
Yan, Zheng9abd4db2016-05-18 20:58:26 +08001985 cap->mds, cap, ceph_cap_string(cap_used),
1986 ceph_cap_string(cap->issued),
Sage Weil088b3f52011-01-18 08:56:01 -08001987 ceph_cap_string(cap->implemented),
1988 ceph_cap_string(revoking));
Sage Weila8599bd2009-10-06 11:31:12 -07001989
1990 if (cap == ci->i_auth_cap &&
1991 (cap->issued & CEPH_CAP_FILE_WR)) {
1992 /* request larger max_size from MDS? */
1993 if (ci->i_wanted_max_size > ci->i_max_size &&
1994 ci->i_wanted_max_size > ci->i_requested_max_size) {
1995 dout("requesting new max_size\n");
1996 goto ack;
1997 }
1998
1999 /* approaching file_max? */
Yan, Zhengefb0ca72017-05-22 12:03:32 +08002000 if (__ceph_should_report_size(ci)) {
Sage Weila8599bd2009-10-06 11:31:12 -07002001 dout("i_size approaching max_size\n");
2002 goto ack;
2003 }
2004 }
2005 /* flush anything dirty? */
Yan, Zheng7bc00fd2016-07-07 18:34:45 +08002006 if (cap == ci->i_auth_cap) {
2007 if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
2008 dout("flushing dirty caps\n");
2009 goto ack;
2010 }
2011 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
2012 dout("flushing snap caps\n");
2013 goto ack;
2014 }
Sage Weila8599bd2009-10-06 11:31:12 -07002015 }
2016
2017 /* completed revocation? going down and there are no caps? */
Yan, Zheng395c3122013-01-04 14:37:57 +08002018 if (revoking && (revoking & cap_used) == 0) {
Sage Weila8599bd2009-10-06 11:31:12 -07002019 dout("completed revocation of %s\n",
2020 ceph_cap_string(cap->implemented & ~cap->issued));
2021 goto ack;
2022 }
2023
2024 /* want more caps from mds? */
Yan, Zheng0aa971b2020-03-10 19:34:20 +08002025 if (want & ~cap->mds_wanted) {
2026 if (want & ~(cap->mds_wanted | cap->issued))
2027 goto ack;
2028 if (!__cap_is_valid(cap))
2029 goto ack;
2030 }
Sage Weila8599bd2009-10-06 11:31:12 -07002031
2032 /* things we might delay */
Yan, Zhengfdac94f2018-11-22 15:26:01 +08002033 if ((cap->issued & ~retain) == 0)
Sage Weila8599bd2009-10-06 11:31:12 -07002034 continue; /* nope, all good */
2035
Sage Weila8599bd2009-10-06 11:31:12 -07002036ack:
2037 if (session && session != cap->session) {
2038 dout("oops, wrong session %p mutex\n", session);
2039 mutex_unlock(&session->s_mutex);
2040 session = NULL;
2041 }
2042 if (!session) {
2043 session = cap->session;
2044 if (mutex_trylock(&session->s_mutex) == 0) {
2045 dout("inverting session/ino locks on %p\n",
2046 session);
Sage Weilbe655592011-11-30 09:47:09 -08002047 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002048 if (took_snap_rwsem) {
2049 up_read(&mdsc->snap_rwsem);
2050 took_snap_rwsem = 0;
2051 }
2052 mutex_lock(&session->s_mutex);
2053 goto retry;
2054 }
2055 }
Yan, Zheng7bc00fd2016-07-07 18:34:45 +08002056
2057 /* kick flushing and flush snaps before sending normal
2058 * cap message */
2059 if (cap == ci->i_auth_cap &&
2060 (ci->i_ceph_flags &
2061 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
Yan, Zheng054f8d42019-06-20 16:00:31 +08002062 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
Yan, Zheng24d063a2017-08-15 11:37:32 +08002063 __kick_flushing_caps(mdsc, session, ci, 0);
Yan, Zhenged9b4302016-07-05 21:08:07 +08002064 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2065 __ceph_flush_snaps(ci, session);
2066
Yan, Zheng7bc00fd2016-07-07 18:34:45 +08002067 goto retry_locked;
2068 }
2069
Sage Weila8599bd2009-10-06 11:31:12 -07002070 /* take snap_rwsem after session mutex */
2071 if (!took_snap_rwsem) {
2072 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
2073 dout("inverting snap/in locks on %p\n",
2074 inode);
Sage Weilbe655592011-11-30 09:47:09 -08002075 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002076 down_read(&mdsc->snap_rwsem);
2077 took_snap_rwsem = 1;
2078 goto retry;
2079 }
2080 took_snap_rwsem = 1;
2081 }
2082
Yan, Zheng553adfd2015-06-09 15:48:57 +08002083 if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
Jeff Layton9f3345d2019-07-08 12:27:57 -04002084 flushing = ci->i_dirty_caps;
2085 flush_tid = __mark_caps_flushing(inode, session, false,
2086 &oldest_flush_tid);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002087 } else {
Sage Weil24be0c42011-01-18 08:48:06 -08002088 flushing = 0;
Yan, Zheng553adfd2015-06-09 15:48:57 +08002089 flush_tid = 0;
Yan, Zhenga2971c82015-06-10 11:09:32 +08002090 spin_lock(&mdsc->cap_dirty_lock);
2091 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2092 spin_unlock(&mdsc->cap_dirty_lock);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002093 }
Sage Weila8599bd2009-10-06 11:31:12 -07002094
2095 mds = cap->mds; /* remember mds, so we don't repeat */
Sage Weila8599bd2009-10-06 11:31:12 -07002096
Jeff Layton0a454bd2020-03-17 08:47:31 -04002097 __prep_cap(&arg, cap, CEPH_CAP_OP_UPDATE, 0, cap_used, want,
Yan, Zhenga0d93e32020-03-05 20:21:01 +08002098 retain, flushing, flush_tid, oldest_flush_tid);
Jeff Layton0a454bd2020-03-17 08:47:31 -04002099 spin_unlock(&ci->i_ceph_lock);
2100
2101 __send_cap(mdsc, &arg, ci);
2102
Sage Weilbe655592011-11-30 09:47:09 -08002103 goto retry; /* retake i_ceph_lock and restart our cap scan. */
Sage Weila8599bd2009-10-06 11:31:12 -07002104 }
2105
Yan, Zhenga0d93e32020-03-05 20:21:01 +08002106 /* periodically re-calculate caps wanted by open files */
2107 if (__ceph_is_any_real_caps(ci) &&
2108 list_empty(&ci->i_cap_delay_list) &&
2109 (file_wanted & ~CEPH_CAP_PIN) &&
2110 !(used & (CEPH_CAP_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
2111 __cap_delay_requeue(mdsc, ci);
Yan, Zheng719a2512020-03-05 20:21:00 +08002112 }
Sage Weila8599bd2009-10-06 11:31:12 -07002113
Sage Weilbe655592011-11-30 09:47:09 -08002114 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002115
Sage Weilcbd03632010-02-09 13:41:18 -08002116 if (queue_invalidate)
Sage Weil3c6f6b72010-02-09 15:24:44 -08002117 ceph_queue_invalidate(inode);
Sage Weilcbd03632010-02-09 13:41:18 -08002118
Sage Weilcdc2ce02010-03-16 13:39:28 -07002119 if (session)
Sage Weila8599bd2009-10-06 11:31:12 -07002120 mutex_unlock(&session->s_mutex);
2121 if (took_snap_rwsem)
2122 up_read(&mdsc->snap_rwsem);
2123}
2124
2125/*
Sage Weila8599bd2009-10-06 11:31:12 -07002126 * Try to flush dirty caps back to the auth mds.
2127 */
Yan, Zheng553adfd2015-06-09 15:48:57 +08002128static int try_flush_caps(struct inode *inode, u64 *ptid)
Sage Weila8599bd2009-10-06 11:31:12 -07002129{
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07002130 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07002131 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng4fe59782013-10-31 16:44:14 +08002132 struct ceph_mds_session *session = NULL;
Yan, Zheng89b52fe2015-05-27 09:59:48 +08002133 int flushing = 0;
Yan, Zhenga2971c82015-06-10 11:09:32 +08002134 u64 flush_tid = 0, oldest_flush_tid = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07002135
2136retry:
Sage Weilbe655592011-11-30 09:47:09 -08002137 spin_lock(&ci->i_ceph_lock);
Yan, Zhengd6cee9d2019-06-20 16:09:37 +08002138retry_locked:
Sage Weila8599bd2009-10-06 11:31:12 -07002139 if (ci->i_dirty_caps && ci->i_auth_cap) {
2140 struct ceph_cap *cap = ci->i_auth_cap;
Jeff Layton0a454bd2020-03-17 08:47:31 -04002141 struct cap_msg_args arg;
Sage Weila8599bd2009-10-06 11:31:12 -07002142
Jeff Layton27b0a392019-07-05 13:26:29 -04002143 if (session != cap->session) {
Sage Weilbe655592011-11-30 09:47:09 -08002144 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng4fe59782013-10-31 16:44:14 +08002145 if (session)
2146 mutex_unlock(&session->s_mutex);
Sage Weila8599bd2009-10-06 11:31:12 -07002147 session = cap->session;
2148 mutex_lock(&session->s_mutex);
2149 goto retry;
2150 }
Jeff Layton6c2838f2017-10-19 08:52:58 -04002151 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) {
2152 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002153 goto out;
Jeff Layton6c2838f2017-10-19 08:52:58 -04002154 }
Sage Weila8599bd2009-10-06 11:31:12 -07002155
Yan, Zhengd6cee9d2019-06-20 16:09:37 +08002156 if (ci->i_ceph_flags &
2157 (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS)) {
2158 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH)
2159 __kick_flushing_caps(mdsc, session, ci, 0);
2160 if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
2161 __ceph_flush_snaps(ci, session);
2162 goto retry_locked;
2163 }
2164
Jeff Layton9f3345d2019-07-08 12:27:57 -04002165 flushing = ci->i_dirty_caps;
2166 flush_tid = __mark_caps_flushing(inode, session, true,
2167 &oldest_flush_tid);
Sage Weila8599bd2009-10-06 11:31:12 -07002168
Jeff Layton0a454bd2020-03-17 08:47:31 -04002169 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH, CEPH_CLIENT_CAPS_SYNC,
Yan, Zhenga0d93e32020-03-05 20:21:01 +08002170 __ceph_caps_used(ci), __ceph_caps_wanted(ci),
2171 (cap->issued | cap->implemented),
2172 flushing, flush_tid, oldest_flush_tid);
Jeff Layton0a454bd2020-03-17 08:47:31 -04002173 spin_unlock(&ci->i_ceph_lock);
2174
2175 __send_cap(mdsc, &arg, ci);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002176 } else {
Yan, Zhenge4500b52016-07-06 11:12:56 +08002177 if (!list_empty(&ci->i_cap_flush_list)) {
Yan, Zheng553adfd2015-06-09 15:48:57 +08002178 struct ceph_cap_flush *cf =
Yan, Zhenge4500b52016-07-06 11:12:56 +08002179 list_last_entry(&ci->i_cap_flush_list,
Yan, Zhengc8799fc2016-07-07 15:22:38 +08002180 struct ceph_cap_flush, i_list);
2181 cf->wake = true;
Yan, Zheng553adfd2015-06-09 15:48:57 +08002182 flush_tid = cf->tid;
2183 }
2184 flushing = ci->i_flushing_caps;
2185 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002186 }
2187out:
Yan, Zheng4fe59782013-10-31 16:44:14 +08002188 if (session)
Sage Weila8599bd2009-10-06 11:31:12 -07002189 mutex_unlock(&session->s_mutex);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002190
2191 *ptid = flush_tid;
Sage Weila8599bd2009-10-06 11:31:12 -07002192 return flushing;
2193}
2194
2195/*
2196 * Return true if we've flushed caps through the given flush_tid.
2197 */
Yan, Zheng553adfd2015-06-09 15:48:57 +08002198static int caps_are_flushed(struct inode *inode, u64 flush_tid)
Sage Weila8599bd2009-10-06 11:31:12 -07002199{
2200 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002201 int ret = 1;
Sage Weila8599bd2009-10-06 11:31:12 -07002202
Sage Weilbe655592011-11-30 09:47:09 -08002203 spin_lock(&ci->i_ceph_lock);
Yan, Zhenge4500b52016-07-06 11:12:56 +08002204 if (!list_empty(&ci->i_cap_flush_list)) {
2205 struct ceph_cap_flush * cf =
2206 list_first_entry(&ci->i_cap_flush_list,
2207 struct ceph_cap_flush, i_list);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002208 if (cf->tid <= flush_tid)
Sage Weila8599bd2009-10-06 11:31:12 -07002209 ret = 0;
Yan, Zheng89b52fe2015-05-27 09:59:48 +08002210 }
Sage Weilbe655592011-11-30 09:47:09 -08002211 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002212 return ret;
2213}
2214
2215/*
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002216 * wait for any unsafe requests to complete.
Yan, Zhengda819c82015-05-27 11:19:34 +08002217 */
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002218static int unsafe_request_wait(struct inode *inode)
Yan, Zhengda819c82015-05-27 11:19:34 +08002219{
2220 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002221 struct ceph_mds_request *req1 = NULL, *req2 = NULL;
2222 int ret, err = 0;
Yan, Zhengda819c82015-05-27 11:19:34 +08002223
2224 spin_lock(&ci->i_unsafe_lock);
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002225 if (S_ISDIR(inode->i_mode) && !list_empty(&ci->i_unsafe_dirops)) {
2226 req1 = list_last_entry(&ci->i_unsafe_dirops,
2227 struct ceph_mds_request,
2228 r_unsafe_dir_item);
2229 ceph_mdsc_get_request(req1);
2230 }
2231 if (!list_empty(&ci->i_unsafe_iops)) {
2232 req2 = list_last_entry(&ci->i_unsafe_iops,
2233 struct ceph_mds_request,
2234 r_unsafe_target_item);
2235 ceph_mdsc_get_request(req2);
2236 }
Yan, Zhengda819c82015-05-27 11:19:34 +08002237 spin_unlock(&ci->i_unsafe_lock);
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002238
Jeff Layton4945a082016-11-16 09:45:22 -05002239 dout("unsafe_request_wait %p wait on tid %llu %llu\n",
Yan, Zheng68cd5b42015-10-27 18:36:06 +08002240 inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
2241 if (req1) {
2242 ret = !wait_for_completion_timeout(&req1->r_safe_completion,
2243 ceph_timeout_jiffies(req1->r_timeout));
2244 if (ret)
2245 err = -EIO;
2246 ceph_mdsc_put_request(req1);
2247 }
2248 if (req2) {
2249 ret = !wait_for_completion_timeout(&req2->r_safe_completion,
2250 ceph_timeout_jiffies(req2->r_timeout));
2251 if (ret)
2252 err = -EIO;
2253 ceph_mdsc_put_request(req2);
2254 }
2255 return err;
Yan, Zhengda819c82015-05-27 11:19:34 +08002256}
2257
Josef Bacik02c24a82011-07-16 20:44:56 -04002258int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
Sage Weila8599bd2009-10-06 11:31:12 -07002259{
Yan, Zhengf4b97862019-07-25 20:16:42 +08002260 struct ceph_file_info *fi = file->private_data;
Christoph Hellwig7ea80852010-05-26 17:53:25 +02002261 struct inode *inode = file->f_mapping->host;
Sage Weila8599bd2009-10-06 11:31:12 -07002262 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002263 u64 flush_tid;
Yan, Zhengf4b97862019-07-25 20:16:42 +08002264 int ret, err;
Sage Weila8599bd2009-10-06 11:31:12 -07002265 int dirty;
2266
2267 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
Yan, Zheng9a5530c2016-06-15 16:29:18 +08002268
Jeff Laytonb74fcea2017-07-25 10:50:41 -04002269 ret = file_write_and_wait_range(file, start, end);
Yan, Zhengda819c82015-05-27 11:19:34 +08002270 if (datasync)
2271 goto out;
2272
Jeff Layton891f3f52020-01-14 15:06:40 -05002273 ret = ceph_wait_on_async_create(inode);
2274 if (ret)
2275 goto out;
2276
Yan, Zheng553adfd2015-06-09 15:48:57 +08002277 dirty = try_flush_caps(inode, &flush_tid);
Sage Weila8599bd2009-10-06 11:31:12 -07002278 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
2279
Yan, Zhengf4b97862019-07-25 20:16:42 +08002280 err = unsafe_request_wait(inode);
Yan, Zhengda819c82015-05-27 11:19:34 +08002281
Sage Weila8599bd2009-10-06 11:31:12 -07002282 /*
2283 * only wait on non-file metadata writeback (the mds
2284 * can recover size and mtime, so we don't need to
2285 * wait for that)
2286 */
Yan, Zhengf4b97862019-07-25 20:16:42 +08002287 if (!err && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
2288 err = wait_event_interruptible(ci->i_cap_wq,
Yan, Zhengda819c82015-05-27 11:19:34 +08002289 caps_are_flushed(inode, flush_tid));
Sage Weila8599bd2009-10-06 11:31:12 -07002290 }
Yan, Zhengf4b97862019-07-25 20:16:42 +08002291
2292 if (err < 0)
2293 ret = err;
2294
2295 if (errseq_check(&ci->i_meta_err, READ_ONCE(fi->meta_err))) {
2296 spin_lock(&file->f_lock);
2297 err = errseq_check_and_advance(&ci->i_meta_err,
2298 &fi->meta_err);
2299 spin_unlock(&file->f_lock);
2300 if (err < 0)
2301 ret = err;
2302 }
Yan, Zhengda819c82015-05-27 11:19:34 +08002303out:
2304 dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
Sage Weila8599bd2009-10-06 11:31:12 -07002305 return ret;
2306}
2307
2308/*
2309 * Flush any dirty caps back to the mds. If we aren't asked to wait,
2310 * queue inode for flush but don't do so immediately, because we can
2311 * get by with fewer MDS messages if we wait for data writeback to
2312 * complete first.
2313 */
Stephen Rothwellf1a3d572010-01-18 11:53:08 +11002314int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
Sage Weila8599bd2009-10-06 11:31:12 -07002315{
2316 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002317 u64 flush_tid;
Sage Weila8599bd2009-10-06 11:31:12 -07002318 int err = 0;
2319 int dirty;
Chengguang Xu16515a62018-01-30 10:02:30 +08002320 int wait = (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync);
Sage Weila8599bd2009-10-06 11:31:12 -07002321
2322 dout("write_inode %p wait=%d\n", inode, wait);
2323 if (wait) {
Yan, Zheng553adfd2015-06-09 15:48:57 +08002324 dirty = try_flush_caps(inode, &flush_tid);
Sage Weila8599bd2009-10-06 11:31:12 -07002325 if (dirty)
2326 err = wait_event_interruptible(ci->i_cap_wq,
2327 caps_are_flushed(inode, flush_tid));
2328 } else {
Cheng Renquan640ef792010-03-26 17:40:33 +08002329 struct ceph_mds_client *mdsc =
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07002330 ceph_sb_to_client(inode->i_sb)->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07002331
Sage Weilbe655592011-11-30 09:47:09 -08002332 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002333 if (__ceph_caps_dirty(ci))
2334 __cap_delay_requeue_front(mdsc, ci);
Sage Weilbe655592011-11-30 09:47:09 -08002335 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002336 }
2337 return err;
2338}
2339
Yan, Zheng0e294382016-07-04 18:06:41 +08002340static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
2341 struct ceph_mds_session *session,
2342 struct ceph_inode_info *ci,
2343 u64 oldest_flush_tid)
2344 __releases(ci->i_ceph_lock)
2345 __acquires(ci->i_ceph_lock)
Yan, Zheng553adfd2015-06-09 15:48:57 +08002346{
2347 struct inode *inode = &ci->vfs_inode;
2348 struct ceph_cap *cap;
2349 struct ceph_cap_flush *cf;
Yan, Zheng0e294382016-07-04 18:06:41 +08002350 int ret;
Yan, Zheng553adfd2015-06-09 15:48:57 +08002351 u64 first_tid = 0;
Yan, Zheng49ada6e2019-06-20 12:09:08 +08002352 u64 last_snap_flush = 0;
Yan, Zhenga2971c82015-06-10 11:09:32 +08002353
Yan, Zheng054f8d42019-06-20 16:00:31 +08002354 ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
2355
Yan, Zheng49ada6e2019-06-20 12:09:08 +08002356 list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
2357 if (!cf->caps) {
2358 last_snap_flush = cf->tid;
2359 break;
2360 }
2361 }
2362
Yan, Zhenge4500b52016-07-06 11:12:56 +08002363 list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
2364 if (cf->tid < first_tid)
2365 continue;
2366
Yan, Zheng553adfd2015-06-09 15:48:57 +08002367 cap = ci->i_auth_cap;
2368 if (!(cap && cap->session == session)) {
Yan, Zheng0e294382016-07-04 18:06:41 +08002369 pr_err("%p auth cap %p not mds%d ???\n",
2370 inode, cap, session->s_mds);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002371 break;
2372 }
2373
Yan, Zheng553adfd2015-06-09 15:48:57 +08002374 first_tid = cf->tid + 1;
2375
Yan, Zheng0e294382016-07-04 18:06:41 +08002376 if (cf->caps) {
Jeff Layton0a454bd2020-03-17 08:47:31 -04002377 struct cap_msg_args arg;
2378
Yan, Zheng0e294382016-07-04 18:06:41 +08002379 dout("kick_flushing_caps %p cap %p tid %llu %s\n",
2380 inode, cap, cf->tid, ceph_cap_string(cf->caps));
Jeff Layton0a454bd2020-03-17 08:47:31 -04002381 __prep_cap(&arg, cap, CEPH_CAP_OP_FLUSH,
Yan, Zheng49ada6e2019-06-20 12:09:08 +08002382 (cf->tid < last_snap_flush ?
2383 CEPH_CLIENT_CAPS_PENDING_CAPSNAP : 0),
2384 __ceph_caps_used(ci),
Yan, Zheng0e294382016-07-04 18:06:41 +08002385 __ceph_caps_wanted(ci),
Yan, Zheng49ada6e2019-06-20 12:09:08 +08002386 (cap->issued | cap->implemented),
Yan, Zheng0e294382016-07-04 18:06:41 +08002387 cf->caps, cf->tid, oldest_flush_tid);
Jeff Layton0a454bd2020-03-17 08:47:31 -04002388 spin_unlock(&ci->i_ceph_lock);
2389 __send_cap(mdsc, &arg, ci);
Yan, Zheng0e294382016-07-04 18:06:41 +08002390 } else {
2391 struct ceph_cap_snap *capsnap =
2392 container_of(cf, struct ceph_cap_snap,
2393 cap_flush);
2394 dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
2395 inode, capsnap, cf->tid,
2396 ceph_cap_string(capsnap->dirty));
2397
Elena Reshetova805692d2017-03-03 11:15:07 +02002398 refcount_inc(&capsnap->nref);
Yan, Zheng0e294382016-07-04 18:06:41 +08002399 spin_unlock(&ci->i_ceph_lock);
2400
2401 ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
2402 oldest_flush_tid);
2403 if (ret < 0) {
2404 pr_err("kick_flushing_caps: error sending "
2405 "cap flushsnap, ino (%llx.%llx) "
2406 "tid %llu follows %llu\n",
2407 ceph_vinop(inode), cf->tid,
2408 capsnap->follows);
2409 }
2410
2411 ceph_put_cap_snap(capsnap);
2412 }
Yan, Zhenge4500b52016-07-06 11:12:56 +08002413
2414 spin_lock(&ci->i_ceph_lock);
Yan, Zheng553adfd2015-06-09 15:48:57 +08002415 }
Yan, Zheng553adfd2015-06-09 15:48:57 +08002416}
2417
Yan, Zhenge548e9b2015-06-10 15:17:56 +08002418void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
2419 struct ceph_mds_session *session)
2420{
2421 struct ceph_inode_info *ci;
2422 struct ceph_cap *cap;
Yan, Zheng0e294382016-07-04 18:06:41 +08002423 u64 oldest_flush_tid;
Yan, Zhenge548e9b2015-06-10 15:17:56 +08002424
2425 dout("early_kick_flushing_caps mds%d\n", session->s_mds);
Yan, Zheng0e294382016-07-04 18:06:41 +08002426
2427 spin_lock(&mdsc->cap_dirty_lock);
2428 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2429 spin_unlock(&mdsc->cap_dirty_lock);
2430
Yan, Zhenge548e9b2015-06-10 15:17:56 +08002431 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
2432 spin_lock(&ci->i_ceph_lock);
2433 cap = ci->i_auth_cap;
2434 if (!(cap && cap->session == session)) {
2435 pr_err("%p auth cap %p not mds%d ???\n",
2436 &ci->vfs_inode, cap, session->s_mds);
2437 spin_unlock(&ci->i_ceph_lock);
2438 continue;
2439 }
2440
2441
2442 /*
2443 * if flushing caps were revoked, we re-send the cap flush
2444 * in client reconnect stage. This guarantees MDS * processes
2445 * the cap flush message before issuing the flushing caps to
2446 * other client.
2447 */
2448 if ((cap->issued & ci->i_flushing_caps) !=
2449 ci->i_flushing_caps) {
Yan, Zheng81c5a142019-01-01 16:28:33 +08002450 /* encode_caps_cb() also will reset these sequence
2451 * numbers. make sure sequence numbers in cap flush
2452 * message match later reconnect message */
2453 cap->seq = 0;
2454 cap->issue_seq = 0;
2455 cap->mseq = 0;
Yan, Zheng0e294382016-07-04 18:06:41 +08002456 __kick_flushing_caps(mdsc, session, ci,
2457 oldest_flush_tid);
Yan, Zheng13c2b572016-07-05 16:45:21 +08002458 } else {
2459 ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
Yan, Zhenge548e9b2015-06-10 15:17:56 +08002460 }
2461
Yan, Zhenge548e9b2015-06-10 15:17:56 +08002462 spin_unlock(&ci->i_ceph_lock);
2463 }
2464}
2465
Sage Weila8599bd2009-10-06 11:31:12 -07002466void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
2467 struct ceph_mds_session *session)
2468{
2469 struct ceph_inode_info *ci;
Yan, Zheng13c2b572016-07-05 16:45:21 +08002470 struct ceph_cap *cap;
Yan, Zheng0e294382016-07-04 18:06:41 +08002471 u64 oldest_flush_tid;
Sage Weila8599bd2009-10-06 11:31:12 -07002472
2473 dout("kick_flushing_caps mds%d\n", session->s_mds);
Yan, Zheng0e294382016-07-04 18:06:41 +08002474
2475 spin_lock(&mdsc->cap_dirty_lock);
2476 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
2477 spin_unlock(&mdsc->cap_dirty_lock);
2478
Sage Weila8599bd2009-10-06 11:31:12 -07002479 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
Yan, Zheng0e294382016-07-04 18:06:41 +08002480 spin_lock(&ci->i_ceph_lock);
Yan, Zheng13c2b572016-07-05 16:45:21 +08002481 cap = ci->i_auth_cap;
2482 if (!(cap && cap->session == session)) {
2483 pr_err("%p auth cap %p not mds%d ???\n",
2484 &ci->vfs_inode, cap, session->s_mds);
2485 spin_unlock(&ci->i_ceph_lock);
2486 continue;
2487 }
2488 if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
Yan, Zheng13c2b572016-07-05 16:45:21 +08002489 __kick_flushing_caps(mdsc, session, ci,
2490 oldest_flush_tid);
2491 }
Yan, Zheng0e294382016-07-04 18:06:41 +08002492 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002493 }
2494}
2495
Jeff Laytone8a4d262020-02-25 11:49:53 -08002496void ceph_kick_flushing_inode_caps(struct ceph_mds_session *session,
2497 struct ceph_inode_info *ci)
Sage Weil088b3f52011-01-18 08:56:01 -08002498{
Jeff Laytone8a4d262020-02-25 11:49:53 -08002499 struct ceph_mds_client *mdsc = session->s_mdsc;
2500 struct ceph_cap *cap = ci->i_auth_cap;
Sage Weil088b3f52011-01-18 08:56:01 -08002501
Jeff Laytone8a4d262020-02-25 11:49:53 -08002502 lockdep_assert_held(&ci->i_ceph_lock);
2503
2504 dout("%s %p flushing %s\n", __func__, &ci->vfs_inode,
Yan, Zheng8310b082015-06-09 17:20:12 +08002505 ceph_cap_string(ci->i_flushing_caps));
Yan, Zheng005c4692013-05-31 16:40:24 +08002506
Yan, Zheng0e294382016-07-04 18:06:41 +08002507 if (!list_empty(&ci->i_cap_flush_list)) {
2508 u64 oldest_flush_tid;
Yan, Zheng005c4692013-05-31 16:40:24 +08002509 spin_lock(&mdsc->cap_dirty_lock);
2510 list_move_tail(&ci->i_flushing_item,
2511 &cap->session->s_cap_flushing);
Yan, Zheng0e294382016-07-04 18:06:41 +08002512 oldest_flush_tid = __get_oldest_flush_tid(mdsc);
Yan, Zheng005c4692013-05-31 16:40:24 +08002513 spin_unlock(&mdsc->cap_dirty_lock);
2514
Yan, Zheng0e294382016-07-04 18:06:41 +08002515 __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
Sage Weil088b3f52011-01-18 08:56:01 -08002516 }
2517}
2518
Sage Weila8599bd2009-10-06 11:31:12 -07002519
2520/*
2521 * Take references to capabilities we hold, so that we don't release
2522 * them to the MDS prematurely.
Sage Weila8599bd2009-10-06 11:31:12 -07002523 */
Jeff Layton40dcf752020-01-14 09:23:49 -05002524void ceph_take_cap_refs(struct ceph_inode_info *ci, int got,
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002525 bool snap_rwsem_locked)
Sage Weila8599bd2009-10-06 11:31:12 -07002526{
Jeff Layton40dcf752020-01-14 09:23:49 -05002527 lockdep_assert_held(&ci->i_ceph_lock);
2528
Sage Weila8599bd2009-10-06 11:31:12 -07002529 if (got & CEPH_CAP_PIN)
2530 ci->i_pin_ref++;
2531 if (got & CEPH_CAP_FILE_RD)
2532 ci->i_rd_ref++;
2533 if (got & CEPH_CAP_FILE_CACHE)
2534 ci->i_rdcache_ref++;
Jeff Laytonf85122a2019-04-02 08:04:30 -04002535 if (got & CEPH_CAP_FILE_EXCL)
2536 ci->i_fx_ref++;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002537 if (got & CEPH_CAP_FILE_WR) {
2538 if (ci->i_wr_ref == 0 && !ci->i_head_snapc) {
2539 BUG_ON(!snap_rwsem_locked);
2540 ci->i_head_snapc = ceph_get_snap_context(
2541 ci->i_snap_realm->cached_context);
2542 }
Sage Weila8599bd2009-10-06 11:31:12 -07002543 ci->i_wr_ref++;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002544 }
Sage Weila8599bd2009-10-06 11:31:12 -07002545 if (got & CEPH_CAP_FILE_BUFFER) {
Henry C Changd3d07202011-05-11 10:29:54 +00002546 if (ci->i_wb_ref == 0)
Sage Weil3772d262011-05-03 09:28:08 -07002547 ihold(&ci->vfs_inode);
Henry C Changd3d07202011-05-11 10:29:54 +00002548 ci->i_wb_ref++;
Jeff Layton40dcf752020-01-14 09:23:49 -05002549 dout("%s %p wb %d -> %d (?)\n", __func__,
Henry C Changd3d07202011-05-11 10:29:54 +00002550 &ci->vfs_inode, ci->i_wb_ref-1, ci->i_wb_ref);
Sage Weila8599bd2009-10-06 11:31:12 -07002551 }
2552}
2553
2554/*
2555 * Try to grab cap references. Specify those refs we @want, and the
2556 * minimal set we @need. Also include the larger offset we are writing
2557 * to (when applicable), and check against max_size here as well.
2558 * Note that caller is responsible for ensuring max_size increases are
2559 * requested from the MDS.
Jeff Layton1199d7d2019-04-02 15:58:05 -04002560 *
Yan, Zheng546d4022020-03-10 19:34:18 +08002561 * Returns 0 if caps were not able to be acquired (yet), 1 if succeed,
2562 * or a negative error code. There are 3 speical error codes:
2563 * -EAGAIN: need to sleep but non-blocking is specified
2564 * -EFBIG: ask caller to call check_max_size() and try again.
2565 * -ESTALE: ask caller to call ceph_renew_caps() and try again.
Sage Weila8599bd2009-10-06 11:31:12 -07002566 */
Yan, Zhengff5d9132019-07-25 20:16:45 +08002567enum {
Yan, Zheng719a2512020-03-05 20:21:00 +08002568 /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */
2569 NON_BLOCKING = (1 << 8),
2570 CHECK_FILELOCK = (1 << 9),
Yan, Zhengff5d9132019-07-25 20:16:45 +08002571};
2572
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002573static int try_get_cap_refs(struct inode *inode, int need, int want,
Yan, Zhengff5d9132019-07-25 20:16:45 +08002574 loff_t endoff, int flags, int *got)
Sage Weila8599bd2009-10-06 11:31:12 -07002575{
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002576 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002577 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07002578 int ret = 0;
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002579 int have, implemented;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002580 bool snap_rwsem_locked = false;
Sage Weila8599bd2009-10-06 11:31:12 -07002581
2582 dout("get_cap_refs %p need %s want %s\n", inode,
2583 ceph_cap_string(need), ceph_cap_string(want));
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002584
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002585again:
Sage Weilbe655592011-11-30 09:47:09 -08002586 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002587
Yan, Zhengff5d9132019-07-25 20:16:45 +08002588 if ((flags & CHECK_FILELOCK) &&
2589 (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK)) {
2590 dout("try_get_cap_refs %p error filelock\n", inode);
2591 ret = -EIO;
2592 goto out_unlock;
2593 }
2594
Yan, Zheng37505d52013-04-12 16:11:10 +08002595 /* finish pending truncate */
2596 while (ci->i_truncate_pending) {
2597 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002598 if (snap_rwsem_locked) {
2599 up_read(&mdsc->snap_rwsem);
2600 snap_rwsem_locked = false;
2601 }
Yan, Zhengb415bf42013-07-02 12:40:19 +08002602 __ceph_do_pending_vmtruncate(inode);
Yan, Zheng37505d52013-04-12 16:11:10 +08002603 spin_lock(&ci->i_ceph_lock);
2604 }
2605
Yan, Zheng3871cbb2013-08-05 14:10:29 +08002606 have = __ceph_caps_issued(ci, &implemented);
2607
2608 if (have & need & CEPH_CAP_FILE_WR) {
Sage Weila8599bd2009-10-06 11:31:12 -07002609 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
2610 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
2611 inode, endoff, ci->i_max_size);
Jeff Layton1199d7d2019-04-02 15:58:05 -04002612 if (endoff > ci->i_requested_max_size)
Yan, Zheng42d70f82020-03-10 19:34:19 +08002613 ret = ci->i_auth_cap ? -EFBIG : -ESTALE;
Yan, Zheng3738daa2014-11-14 22:10:07 +08002614 goto out_unlock;
Sage Weila8599bd2009-10-06 11:31:12 -07002615 }
2616 /*
2617 * If a sync write is in progress, we must wait, so that we
2618 * can get a final snapshot value for size+mtime.
2619 */
2620 if (__ceph_have_pending_cap_snap(ci)) {
2621 dout("get_cap_refs %p cap_snap_pending\n", inode);
Yan, Zheng3738daa2014-11-14 22:10:07 +08002622 goto out_unlock;
Sage Weila8599bd2009-10-06 11:31:12 -07002623 }
2624 }
Sage Weila8599bd2009-10-06 11:31:12 -07002625
Sage Weila8599bd2009-10-06 11:31:12 -07002626 if ((have & need) == need) {
2627 /*
2628 * Look at (implemented & ~have & not) so that we keep waiting
2629 * on transition from wanted -> needed caps. This is needed
2630 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
2631 * going before a prior buffered writeback happens.
2632 */
2633 int not = want & ~(have & need);
2634 int revoking = implemented & ~have;
2635 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
2636 inode, ceph_cap_string(have), ceph_cap_string(not),
2637 ceph_cap_string(revoking));
2638 if ((revoking & not) == 0) {
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002639 if (!snap_rwsem_locked &&
2640 !ci->i_head_snapc &&
2641 (need & CEPH_CAP_FILE_WR)) {
2642 if (!down_read_trylock(&mdsc->snap_rwsem)) {
2643 /*
2644 * we can not call down_read() when
2645 * task isn't in TASK_RUNNING state
2646 */
Yan, Zhengff5d9132019-07-25 20:16:45 +08002647 if (flags & NON_BLOCKING) {
Jeff Layton1199d7d2019-04-02 15:58:05 -04002648 ret = -EAGAIN;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002649 goto out_unlock;
2650 }
2651
2652 spin_unlock(&ci->i_ceph_lock);
2653 down_read(&mdsc->snap_rwsem);
2654 snap_rwsem_locked = true;
2655 goto again;
2656 }
2657 snap_rwsem_locked = true;
2658 }
Yan, Zheng173e70e2020-02-18 08:17:08 -05002659 if ((have & want) == want)
2660 *got = need | want;
2661 else
2662 *got = need;
Yan, Zheng525d15e2019-05-11 17:27:59 +08002663 if (S_ISREG(inode->i_mode) &&
2664 (need & CEPH_CAP_FILE_RD) &&
Yan, Zhengf7f7e7a2016-05-18 20:31:55 +08002665 !(*got & CEPH_CAP_FILE_CACHE))
2666 ceph_disable_fscache_readpage(ci);
Jeff Layton40dcf752020-01-14 09:23:49 -05002667 ceph_take_cap_refs(ci, *got, true);
Sage Weila8599bd2009-10-06 11:31:12 -07002668 ret = 1;
2669 }
2670 } else {
Yan, Zheng03f4fcb2015-01-05 11:04:04 +08002671 int session_readonly = false;
Yan, Zhengc0e385b2020-03-05 20:20:59 +08002672 int mds_wanted;
Yan, Zheng525d15e2019-05-11 17:27:59 +08002673 if (ci->i_auth_cap &&
2674 (need & (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_EXCL))) {
Yan, Zheng03f4fcb2015-01-05 11:04:04 +08002675 struct ceph_mds_session *s = ci->i_auth_cap->session;
2676 spin_lock(&s->s_cap_lock);
2677 session_readonly = s->s_readonly;
2678 spin_unlock(&s->s_cap_lock);
2679 }
2680 if (session_readonly) {
Yan, Zhengc0e385b2020-03-05 20:20:59 +08002681 dout("get_cap_refs %p need %s but mds%d readonly\n",
Yan, Zheng03f4fcb2015-01-05 11:04:04 +08002682 inode, ceph_cap_string(need), ci->i_auth_cap->mds);
Jeff Layton1199d7d2019-04-02 15:58:05 -04002683 ret = -EROFS;
Yan, Zheng03f4fcb2015-01-05 11:04:04 +08002684 goto out_unlock;
2685 }
2686
Yan, Zhengc0e385b2020-03-05 20:20:59 +08002687 if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
2688 dout("get_cap_refs %p forced umount\n", inode);
2689 ret = -EIO;
2690 goto out_unlock;
2691 }
2692 mds_wanted = __ceph_caps_mds_wanted(ci, false);
2693 if (need & ~mds_wanted) {
2694 dout("get_cap_refs %p need %s > mds_wanted %s\n",
2695 inode, ceph_cap_string(need),
2696 ceph_cap_string(mds_wanted));
2697 ret = -ESTALE;
2698 goto out_unlock;
Yan, Zheng48fec5d2015-07-01 16:27:46 +08002699 }
2700
Yan, Zhengc0e385b2020-03-05 20:20:59 +08002701 dout("get_cap_refs %p have %s need %s\n", inode,
Sage Weila8599bd2009-10-06 11:31:12 -07002702 ceph_cap_string(have), ceph_cap_string(need));
2703 }
Yan, Zheng3738daa2014-11-14 22:10:07 +08002704out_unlock:
Yan, Zheng719a2512020-03-05 20:21:00 +08002705
2706 __ceph_touch_fmode(ci, mdsc, flags);
2707
Sage Weilbe655592011-11-30 09:47:09 -08002708 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002709 if (snap_rwsem_locked)
2710 up_read(&mdsc->snap_rwsem);
Yan, Zheng3738daa2014-11-14 22:10:07 +08002711
Xiubo Li1af16d52020-03-19 23:45:00 -04002712 if (!ret)
2713 ceph_update_cap_mis(&mdsc->metric);
2714 else if (ret == 1)
2715 ceph_update_cap_hit(&mdsc->metric);
2716
Sage Weila8599bd2009-10-06 11:31:12 -07002717 dout("get_cap_refs %p ret %d got %s\n", inode,
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002718 ret, ceph_cap_string(*got));
Sage Weila8599bd2009-10-06 11:31:12 -07002719 return ret;
2720}
2721
2722/*
2723 * Check the offset we are writing up to against our current
2724 * max_size. If necessary, tell the MDS we want to write to
2725 * a larger offset.
2726 */
2727static void check_max_size(struct inode *inode, loff_t endoff)
2728{
2729 struct ceph_inode_info *ci = ceph_inode(inode);
2730 int check = 0;
2731
2732 /* do we need to explicitly request a larger max_size? */
Sage Weilbe655592011-11-30 09:47:09 -08002733 spin_lock(&ci->i_ceph_lock);
Yan, Zheng3871cbb2013-08-05 14:10:29 +08002734 if (endoff >= ci->i_max_size && endoff > ci->i_wanted_max_size) {
Sage Weila8599bd2009-10-06 11:31:12 -07002735 dout("write %p at large endoff %llu, req max_size\n",
2736 inode, endoff);
2737 ci->i_wanted_max_size = endoff;
Sage Weila8599bd2009-10-06 11:31:12 -07002738 }
Yan, Zheng3871cbb2013-08-05 14:10:29 +08002739 /* duplicate ceph_check_caps()'s logic */
2740 if (ci->i_auth_cap &&
2741 (ci->i_auth_cap->issued & CEPH_CAP_FILE_WR) &&
2742 ci->i_wanted_max_size > ci->i_max_size &&
2743 ci->i_wanted_max_size > ci->i_requested_max_size)
2744 check = 1;
Sage Weilbe655592011-11-30 09:47:09 -08002745 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002746 if (check)
2747 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2748}
2749
Yan, Zheng719a2512020-03-05 20:21:00 +08002750static inline int get_used_fmode(int caps)
2751{
2752 int fmode = 0;
2753 if (caps & CEPH_CAP_FILE_RD)
2754 fmode |= CEPH_FILE_MODE_RD;
2755 if (caps & CEPH_CAP_FILE_WR)
2756 fmode |= CEPH_FILE_MODE_WR;
2757 return fmode;
2758}
2759
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002760int ceph_try_get_caps(struct inode *inode, int need, int want,
Luis Henriques2ee9dd92018-10-15 16:45:57 +01002761 bool nonblock, int *got)
Yan, Zheng2b1ac852016-10-25 10:51:55 +08002762{
Yan, Zheng719a2512020-03-05 20:21:00 +08002763 int ret, flags;
Yan, Zheng2b1ac852016-10-25 10:51:55 +08002764
2765 BUG_ON(need & ~CEPH_CAP_FILE_RD);
Jeff Laytona25949b2020-02-18 14:12:45 -05002766 BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
2767 CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
2768 CEPH_CAP_ANY_DIR_OPS));
2769 if (need) {
2770 ret = ceph_pool_perm_check(inode, need);
2771 if (ret < 0)
2772 return ret;
2773 }
Yan, Zheng2b1ac852016-10-25 10:51:55 +08002774
Yan, Zheng719a2512020-03-05 20:21:00 +08002775 flags = get_used_fmode(need | want);
2776 if (nonblock)
2777 flags |= NON_BLOCKING;
2778
2779 ret = try_get_cap_refs(inode, need, want, 0, flags, got);
Yan, Zheng546d4022020-03-10 19:34:18 +08002780 /* three special error codes */
Wu Bo7d8976a2020-04-29 10:01:55 +08002781 if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE)
Yan, Zheng546d4022020-03-10 19:34:18 +08002782 ret = 0;
2783 return ret;
Yan, Zheng2b1ac852016-10-25 10:51:55 +08002784}
2785
Sage Weila8599bd2009-10-06 11:31:12 -07002786/*
2787 * Wait for caps, and take cap references. If we can't get a WR cap
2788 * due to a small max_size, make sure we check_max_size (and possibly
2789 * ask the mds) so we don't get hung up indefinitely.
2790 */
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002791int ceph_get_caps(struct file *filp, int need, int want,
Yan, Zheng3738daa2014-11-14 22:10:07 +08002792 loff_t endoff, int *got, struct page **pinned_page)
Sage Weila8599bd2009-10-06 11:31:12 -07002793{
Yan, Zhengff5d9132019-07-25 20:16:45 +08002794 struct ceph_file_info *fi = filp->private_data;
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002795 struct inode *inode = file_inode(filp);
2796 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng81f148a2019-07-25 20:16:46 +08002797 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
Yan, Zhengff5d9132019-07-25 20:16:45 +08002798 int ret, _got, flags;
Sage Weila8599bd2009-10-06 11:31:12 -07002799
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002800 ret = ceph_pool_perm_check(inode, need);
Yan, Zheng10183a62015-04-27 15:33:28 +08002801 if (ret < 0)
2802 return ret;
2803
Yan, Zheng81f148a2019-07-25 20:16:46 +08002804 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2805 fi->filp_gen != READ_ONCE(fsc->filp_gen))
2806 return -EBADF;
2807
Yan, Zheng719a2512020-03-05 20:21:00 +08002808 flags = get_used_fmode(need | want);
2809
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002810 while (true) {
Yan, Zheng719a2512020-03-05 20:21:00 +08002811 flags &= CEPH_FILE_MODE_MASK;
2812 if (atomic_read(&fi->num_locks))
2813 flags |= CHECK_FILELOCK;
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002814 _got = 0;
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002815 ret = try_get_cap_refs(inode, need, want, endoff,
Yan, Zhengff5d9132019-07-25 20:16:45 +08002816 flags, &_got);
Yan, Zheng546d4022020-03-10 19:34:18 +08002817 WARN_ON_ONCE(ret == -EAGAIN);
Yan, Zheng7b2f9362019-05-20 09:50:09 +08002818 if (!ret) {
Jeff Layton3a3430a2019-11-20 12:00:59 -05002819 struct ceph_mds_client *mdsc = fsc->mdsc;
2820 struct cap_wait cw;
Nikolay Borisov5c341ee32016-10-11 12:04:09 +03002821 DEFINE_WAIT_FUNC(wait, woken_wake_function);
Jeff Layton3a3430a2019-11-20 12:00:59 -05002822
2823 cw.ino = inode->i_ino;
2824 cw.tgid = current->tgid;
2825 cw.need = need;
2826 cw.want = want;
2827
2828 spin_lock(&mdsc->caps_list_lock);
2829 list_add(&cw.list, &mdsc->cap_wait_list);
2830 spin_unlock(&mdsc->caps_list_lock);
2831
Yan, Zheng719a2512020-03-05 20:21:00 +08002832 /* make sure used fmode not timeout */
2833 ceph_get_fmode(ci, flags, FMODE_WAIT_BIAS);
Nikolay Borisov5c341ee32016-10-11 12:04:09 +03002834 add_wait_queue(&ci->i_cap_wq, &wait);
2835
Yan, Zhengff5d9132019-07-25 20:16:45 +08002836 flags |= NON_BLOCKING;
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002837 while (!(ret = try_get_cap_refs(inode, need, want,
Yan, Zhengff5d9132019-07-25 20:16:45 +08002838 endoff, flags, &_got))) {
Yan, Zheng6e09d0f2016-12-22 16:05:43 +08002839 if (signal_pending(current)) {
2840 ret = -ERESTARTSYS;
2841 break;
2842 }
Nikolay Borisov5c341ee32016-10-11 12:04:09 +03002843 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
Yan, Zheng6e09d0f2016-12-22 16:05:43 +08002844 }
Nikolay Borisov5c341ee32016-10-11 12:04:09 +03002845
2846 remove_wait_queue(&ci->i_cap_wq, &wait);
Yan, Zheng719a2512020-03-05 20:21:00 +08002847 ceph_put_fmode(ci, flags, FMODE_WAIT_BIAS);
Jeff Layton3a3430a2019-11-20 12:00:59 -05002848
2849 spin_lock(&mdsc->caps_list_lock);
2850 list_del(&cw.list);
2851 spin_unlock(&mdsc->caps_list_lock);
2852
Yan, Zheng7b2f9362019-05-20 09:50:09 +08002853 if (ret == -EAGAIN)
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002854 continue;
Yan, Zheng77310322016-04-08 15:27:16 +08002855 }
Yan, Zheng81f148a2019-07-25 20:16:46 +08002856
2857 if ((fi->fmode & CEPH_FILE_MODE_WR) &&
2858 fi->filp_gen != READ_ONCE(fsc->filp_gen)) {
2859 if (ret >= 0 && _got)
2860 ceph_put_cap_refs(ci, _got);
2861 return -EBADF;
2862 }
2863
Yan, Zheng7b2f9362019-05-20 09:50:09 +08002864 if (ret < 0) {
Yan, Zheng9bccb762020-03-10 19:34:21 +08002865 if (ret == -EFBIG || ret == -ESTALE) {
2866 int ret2 = ceph_wait_on_async_create(inode);
2867 if (ret2 < 0)
2868 return ret2;
2869 }
Yan, Zheng546d4022020-03-10 19:34:18 +08002870 if (ret == -EFBIG) {
2871 check_max_size(inode, endoff);
2872 continue;
2873 }
Yan, Zheng7b2f9362019-05-20 09:50:09 +08002874 if (ret == -ESTALE) {
2875 /* session was killed, try renew caps */
Yan, Zheng719a2512020-03-05 20:21:00 +08002876 ret = ceph_renew_caps(inode, flags);
Yan, Zheng7b2f9362019-05-20 09:50:09 +08002877 if (ret == 0)
2878 continue;
2879 }
Yan, Zheng77310322016-04-08 15:27:16 +08002880 return ret;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002881 }
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002882
Yan, Zheng525d15e2019-05-11 17:27:59 +08002883 if (S_ISREG(ci->vfs_inode.i_mode) &&
2884 ci->i_inline_version != CEPH_INLINE_NONE &&
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002885 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002886 i_size_read(inode) > 0) {
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002887 struct page *page =
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002888 find_get_page(inode->i_mapping, 0);
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002889 if (page) {
2890 if (PageUptodate(page)) {
2891 *pinned_page = page;
2892 break;
2893 }
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03002894 put_page(page);
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002895 }
2896 /*
			 * drop cap refs first because getattr while
			 * holding cap refs can cause deadlock.
2899 */
2900 ceph_put_cap_refs(ci, _got);
2901 _got = 0;
2902
2903 /*
2904 * getattr request will bring inline data into
2905 * page cache
2906 */
Yan, Zheng5e3ded12019-07-25 20:16:43 +08002907 ret = __ceph_do_getattr(inode, NULL,
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002908 CEPH_STAT_CAP_INLINE_DATA,
2909 true);
2910 if (ret < 0)
2911 return ret;
2912 continue;
2913 }
2914 break;
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002915 }
Yan, Zheng5dda377c2015-04-30 14:40:54 +08002916
Yan, Zheng525d15e2019-05-11 17:27:59 +08002917 if (S_ISREG(ci->vfs_inode.i_mode) &&
2918 (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE))
Yan, Zhengf7f7e7a2016-05-18 20:31:55 +08002919 ceph_fscache_revalidate_cookie(ci);
2920
Yan, Zhengc4d4a582015-01-09 15:56:18 +08002921 *got = _got;
2922 return 0;
Sage Weila8599bd2009-10-06 11:31:12 -07002923}
2924
2925/*
2926 * Take cap refs. Caller must already know we hold at least one ref
2927 * on the caps in question or we don't know this is safe.
2928 */
2929void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2930{
Sage Weilbe655592011-11-30 09:47:09 -08002931 spin_lock(&ci->i_ceph_lock);
Jeff Layton40dcf752020-01-14 09:23:49 -05002932 ceph_take_cap_refs(ci, caps, false);
Sage Weilbe655592011-11-30 09:47:09 -08002933 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002934}
2935
Yan, Zheng86056092015-05-01 16:57:16 +08002936
2937/*
2938 * drop cap_snap that is not associated with any snapshot.
2939 * we don't need to send FLUSHSNAP message for it.
2940 */
Yan, Zheng70220ac2016-07-06 16:21:30 +08002941static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
2942 struct ceph_cap_snap *capsnap)
Yan, Zheng86056092015-05-01 16:57:16 +08002943{
2944 if (!capsnap->need_flush &&
2945 !capsnap->writing && !capsnap->dirty_pages) {
Yan, Zheng86056092015-05-01 16:57:16 +08002946 dout("dropping cap_snap %p follows %llu\n",
2947 capsnap, capsnap->follows);
Yan, Zheng0e294382016-07-04 18:06:41 +08002948 BUG_ON(capsnap->cap_flush.tid > 0);
Yan, Zheng86056092015-05-01 16:57:16 +08002949 ceph_put_snap_context(capsnap->context);
Yan, Zheng70220ac2016-07-06 16:21:30 +08002950 if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
2951 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
2952
Yan, Zheng86056092015-05-01 16:57:16 +08002953 list_del(&capsnap->ci_item);
Yan, Zheng86056092015-05-01 16:57:16 +08002954 ceph_put_cap_snap(capsnap);
2955 return 1;
2956 }
2957 return 0;
2958}
2959
Sage Weila8599bd2009-10-06 11:31:12 -07002960/*
2961 * Release cap refs.
2962 *
2963 * If we released the last ref on any given cap, call ceph_check_caps
2964 * to release (or schedule a release).
2965 *
2966 * If we are releasing a WR cap (from a sync write), finalize any affected
2967 * cap_snap, and wake up any waiters.
2968 */
2969void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2970{
2971 struct inode *inode = &ci->vfs_inode;
2972 int last = 0, put = 0, flushsnaps = 0, wake = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07002973
Sage Weilbe655592011-11-30 09:47:09 -08002974 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07002975 if (had & CEPH_CAP_PIN)
2976 --ci->i_pin_ref;
2977 if (had & CEPH_CAP_FILE_RD)
2978 if (--ci->i_rd_ref == 0)
2979 last++;
2980 if (had & CEPH_CAP_FILE_CACHE)
2981 if (--ci->i_rdcache_ref == 0)
2982 last++;
Jeff Laytonf85122a2019-04-02 08:04:30 -04002983 if (had & CEPH_CAP_FILE_EXCL)
2984 if (--ci->i_fx_ref == 0)
2985 last++;
Sage Weila8599bd2009-10-06 11:31:12 -07002986 if (had & CEPH_CAP_FILE_BUFFER) {
Henry C Changd3d07202011-05-11 10:29:54 +00002987 if (--ci->i_wb_ref == 0) {
Sage Weila8599bd2009-10-06 11:31:12 -07002988 last++;
2989 put++;
2990 }
Henry C Changd3d07202011-05-11 10:29:54 +00002991 dout("put_cap_refs %p wb %d -> %d (?)\n",
2992 inode, ci->i_wb_ref+1, ci->i_wb_ref);
Sage Weila8599bd2009-10-06 11:31:12 -07002993 }
2994 if (had & CEPH_CAP_FILE_WR)
2995 if (--ci->i_wr_ref == 0) {
2996 last++;
Yan, Zheng86056092015-05-01 16:57:16 +08002997 if (__ceph_have_pending_cap_snap(ci)) {
2998 struct ceph_cap_snap *capsnap =
2999 list_last_entry(&ci->i_cap_snaps,
3000 struct ceph_cap_snap,
3001 ci_item);
3002 capsnap->writing = 0;
Yan, Zheng70220ac2016-07-06 16:21:30 +08003003 if (ceph_try_drop_cap_snap(ci, capsnap))
Yan, Zheng86056092015-05-01 16:57:16 +08003004 put++;
3005 else if (__ceph_finish_cap_snap(ci, capsnap))
3006 flushsnaps = 1;
3007 wake = 1;
Sage Weila8599bd2009-10-06 11:31:12 -07003008 }
Yan, Zheng5dda377c2015-04-30 14:40:54 +08003009 if (ci->i_wrbuffer_ref_head == 0 &&
3010 ci->i_dirty_caps == 0 &&
3011 ci->i_flushing_caps == 0) {
3012 BUG_ON(!ci->i_head_snapc);
3013 ceph_put_snap_context(ci->i_head_snapc);
3014 ci->i_head_snapc = NULL;
3015 }
Yan, Zhengdb40cc12015-03-23 20:12:20 +08003016 /* see comment in __ceph_remove_cap() */
Xiubo Libd84fbc2019-12-03 03:00:51 -05003017 if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
Yan, Zhengdb40cc12015-03-23 20:12:20 +08003018 drop_inode_snap_realm(ci);
Sage Weila8599bd2009-10-06 11:31:12 -07003019 }
Sage Weilbe655592011-11-30 09:47:09 -08003020 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003021
Sage Weil819ccbf2010-04-01 09:33:46 -07003022 dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had),
3023 last ? " last" : "", put ? " put" : "");
Sage Weila8599bd2009-10-06 11:31:12 -07003024
Yan, Zhenga0d93e32020-03-05 20:21:01 +08003025 if (last)
Sage Weila8599bd2009-10-06 11:31:12 -07003026 ceph_check_caps(ci, 0, NULL);
3027 else if (flushsnaps)
Yan, Zhenged9b4302016-07-05 21:08:07 +08003028 ceph_flush_snaps(ci, NULL);
Sage Weila8599bd2009-10-06 11:31:12 -07003029 if (wake)
Yehuda Sadeh03066f22010-07-27 13:11:08 -07003030 wake_up_all(&ci->i_cap_wq);
Yan, Zheng86056092015-05-01 16:57:16 +08003031 while (put-- > 0)
Sage Weila8599bd2009-10-06 11:31:12 -07003032 iput(inode);
3033}
3034
3035/*
3036 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
3037 * context. Adjust per-snap dirty page accounting as appropriate.
3038 * Once all dirty data for a cap_snap is flushed, flush snapped file
3039 * metadata back to the MDS. If we dropped the last ref, call
3040 * ceph_check_caps.
3041 */
3042void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
3043 struct ceph_snap_context *snapc)
3044{
3045 struct inode *inode = &ci->vfs_inode;
Sage Weila8599bd2009-10-06 11:31:12 -07003046 struct ceph_cap_snap *capsnap = NULL;
Yan, Zheng70220ac2016-07-06 16:21:30 +08003047 int put = 0;
3048 bool last = false;
3049 bool found = false;
3050 bool flush_snaps = false;
3051 bool complete_capsnap = false;
Sage Weila8599bd2009-10-06 11:31:12 -07003052
Sage Weilbe655592011-11-30 09:47:09 -08003053 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003054 ci->i_wrbuffer_ref -= nr;
Yan, Zheng70220ac2016-07-06 16:21:30 +08003055 if (ci->i_wrbuffer_ref == 0) {
3056 last = true;
3057 put++;
3058 }
Sage Weila8599bd2009-10-06 11:31:12 -07003059
3060 if (ci->i_head_snapc == snapc) {
3061 ci->i_wrbuffer_ref_head -= nr;
Sage Weil7d8cb262010-08-24 08:44:16 -07003062 if (ci->i_wrbuffer_ref_head == 0 &&
Yan, Zheng5dda377c2015-04-30 14:40:54 +08003063 ci->i_wr_ref == 0 &&
3064 ci->i_dirty_caps == 0 &&
3065 ci->i_flushing_caps == 0) {
Sage Weil7d8cb262010-08-24 08:44:16 -07003066 BUG_ON(!ci->i_head_snapc);
Sage Weila8599bd2009-10-06 11:31:12 -07003067 ceph_put_snap_context(ci->i_head_snapc);
3068 ci->i_head_snapc = NULL;
3069 }
3070 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
3071 inode,
3072 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
3073 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
3074 last ? " LAST" : "");
3075 } else {
3076 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
3077 if (capsnap->context == snapc) {
Yan, Zheng70220ac2016-07-06 16:21:30 +08003078 found = true;
Sage Weila8599bd2009-10-06 11:31:12 -07003079 break;
3080 }
3081 }
3082 BUG_ON(!found);
Sage Weil819ccbf2010-04-01 09:33:46 -07003083 capsnap->dirty_pages -= nr;
3084 if (capsnap->dirty_pages == 0) {
Yan, Zheng70220ac2016-07-06 16:21:30 +08003085 complete_capsnap = true;
3086 if (!capsnap->writing) {
3087 if (ceph_try_drop_cap_snap(ci, capsnap)) {
3088 put++;
3089 } else {
3090 ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
3091 flush_snaps = true;
3092 }
3093 }
Sage Weil819ccbf2010-04-01 09:33:46 -07003094 }
Sage Weila8599bd2009-10-06 11:31:12 -07003095 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
Yan, Zheng86056092015-05-01 16:57:16 +08003096 " snap %lld %d/%d -> %d/%d %s%s\n",
Sage Weila8599bd2009-10-06 11:31:12 -07003097 inode, capsnap, capsnap->context->seq,
3098 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
3099 ci->i_wrbuffer_ref, capsnap->dirty_pages,
3100 last ? " (wrbuffer last)" : "",
Yan, Zheng86056092015-05-01 16:57:16 +08003101 complete_capsnap ? " (complete capsnap)" : "");
Sage Weila8599bd2009-10-06 11:31:12 -07003102 }
3103
Sage Weilbe655592011-11-30 09:47:09 -08003104 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003105
3106 if (last) {
Yan, Zhengbf73c622020-03-05 20:21:04 +08003107 ceph_check_caps(ci, 0, NULL);
Yan, Zheng70220ac2016-07-06 16:21:30 +08003108 } else if (flush_snaps) {
Yan, Zhenged9b4302016-07-05 21:08:07 +08003109 ceph_flush_snaps(ci, NULL);
Sage Weila8599bd2009-10-06 11:31:12 -07003110 }
Yan, Zheng70220ac2016-07-06 16:21:30 +08003111 if (complete_capsnap)
3112 wake_up_all(&ci->i_cap_wq);
Yan, Zheng3e1d0452019-05-18 20:39:55 +08003113 while (put-- > 0) {
3114 /* avoid calling iput_final() in osd dispatch threads */
3115 ceph_async_iput(inode);
3116 }
Sage Weila8599bd2009-10-06 11:31:12 -07003117}
3118
3119/*
Yan, Zhengca20c992013-07-21 10:07:51 +08003120 * Invalidate unlinked inode's aliases, so we can drop the inode ASAP.
3121 */
3122static void invalidate_aliases(struct inode *inode)
3123{
3124 struct dentry *dn, *prev = NULL;
3125
3126 dout("invalidate_aliases inode %p\n", inode);
3127 d_prune_aliases(inode);
3128 /*
3129 * For non-directory inode, d_find_alias() only returns
J. Bruce Fieldsfc12c802014-01-16 17:42:53 -05003130 * hashed dentry. After calling d_invalidate(), the
3131 * dentry becomes unhashed.
Yan, Zhengca20c992013-07-21 10:07:51 +08003132 *
Yan, Zhenga8d436f2013-09-02 15:19:54 +08003133 * For directory inode, d_find_alias() can return
J. Bruce Fieldsfc12c802014-01-16 17:42:53 -05003134 * unhashed dentry. But directory inode should have
Yan, Zhengca20c992013-07-21 10:07:51 +08003135 * one alias at most.
3136 */
3137 while ((dn = d_find_alias(inode))) {
3138 if (dn == prev) {
3139 dput(dn);
3140 break;
3141 }
Yan, Zhenga8d436f2013-09-02 15:19:54 +08003142 d_invalidate(dn);
Yan, Zhengca20c992013-07-21 10:07:51 +08003143 if (prev)
3144 dput(prev);
3145 prev = dn;
3146 }
3147 if (prev)
3148 dput(prev);
3149}
3150
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003151struct cap_extra_info {
3152 struct ceph_string *pool_ns;
3153 /* inline data */
3154 u64 inline_version;
3155 void *inline_data;
3156 u32 inline_len;
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003157 /* dirstat */
3158 bool dirstat_valid;
3159 u64 nfiles;
3160 u64 nsubdirs;
Jeff Layton176c77c2019-06-06 08:06:40 -04003161 u64 change_attr;
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003162 /* currently issued */
3163 int issued;
Jeff Laytonec62b892019-05-29 12:23:14 -04003164 struct timespec64 btime;
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003165};
3166
Yan, Zhengca20c992013-07-21 10:07:51 +08003167/*
Sage Weila8599bd2009-10-06 11:31:12 -07003168 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
3169 * actually be a revocation if it specifies a smaller cap set.)
3170 *
Sage Weilbe655592011-11-30 09:47:09 -08003171 * caller holds s_mutex and i_ceph_lock, we drop both.
Sage Weila8599bd2009-10-06 11:31:12 -07003172 */
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003173static void handle_cap_grant(struct inode *inode,
Sage Weil15637c82010-03-16 13:42:00 -07003174 struct ceph_mds_session *session,
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003175 struct ceph_cap *cap,
3176 struct ceph_mds_caps *grant,
3177 struct ceph_buffer *xattr_buf,
3178 struct cap_extra_info *extra_info)
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003179 __releases(ci->i_ceph_lock)
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003180 __releases(session->s_mdsc->snap_rwsem)
Sage Weila8599bd2009-10-06 11:31:12 -07003181{
3182 struct ceph_inode_info *ci = ceph_inode(inode);
Sage Weil2f56f562010-10-27 20:59:49 -07003183 int seq = le32_to_cpu(grant->seq);
Sage Weila8599bd2009-10-06 11:31:12 -07003184 int newcaps = le32_to_cpu(grant->caps);
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003185 int used, wanted, dirty;
Sage Weila8599bd2009-10-06 11:31:12 -07003186 u64 size = le64_to_cpu(grant->size);
3187 u64 max_size = le64_to_cpu(grant->max_size);
Yan, Zhengfdac94f2018-11-22 15:26:01 +08003188 unsigned char check_caps = 0;
3189 bool was_stale = cap->cap_gen < session->s_cap_gen;
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003190 bool wake = false;
3191 bool writeback = false;
3192 bool queue_trunc = false;
3193 bool queue_invalidate = false;
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003194 bool deleted_inode = false;
Yan, Zheng31c542a2014-11-14 21:41:55 +08003195 bool fill_inline = false;
Sage Weila8599bd2009-10-06 11:31:12 -07003196
Sage Weil2f56f562010-10-27 20:59:49 -07003197 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003198 inode, cap, session->s_mds, seq, ceph_cap_string(newcaps));
Sage Weila8599bd2009-10-06 11:31:12 -07003199 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
3200 inode->i_size);
3201
Yan, Zheng11df2df2013-11-24 14:44:38 +08003202
3203 /*
Sage Weila8599bd2009-10-06 11:31:12 -07003204 * If CACHE is being revoked, and we have no dirty buffers,
3205 * try to invalidate (once). (If there are dirty buffers, we
3206 * will invalidate _after_ writeback.)
3207 */
Yan, Zheng525d15e2019-05-11 17:27:59 +08003208 if (S_ISREG(inode->i_mode) && /* don't invalidate readdir cache */
Yan, Zhengfdd4e152015-06-16 20:48:56 +08003209 ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
Sage Weil3b454c42010-06-10 13:20:33 -07003210 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
Yan, Zheng9abd4db2016-05-18 20:58:26 +08003211 !(ci->i_wrbuffer_ref || ci->i_wb_ref)) {
Li Wange9075742013-08-15 22:00:25 -07003212 if (try_nonblocking_invalidate(inode)) {
Sage Weila8599bd2009-10-06 11:31:12 -07003213 /* there were locked pages.. invalidate later
3214 in a separate thread. */
3215 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003216 queue_invalidate = true;
Sage Weila8599bd2009-10-06 11:31:12 -07003217 ci->i_rdcache_revoking = ci->i_rdcache_gen;
3218 }
Sage Weila8599bd2009-10-06 11:31:12 -07003219 }
Sage Weila8599bd2009-10-06 11:31:12 -07003220 }
3221
Yan, Zhengd2f8bb22018-12-10 16:35:09 +08003222 if (was_stale)
3223 cap->issued = cap->implemented = CEPH_CAP_PIN;
3224
3225 /*
3226 * auth mds of the inode changed. we received the cap export message,
3227 * but still haven't received the cap import message. handle_cap_export
3228 * updated the new auth MDS' cap.
3229 *
3230 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing a message
3231 * that was sent before the cap import message. So don't remove caps.
3232 */
3233 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3234 WARN_ON(cap != ci->i_auth_cap);
3235 WARN_ON(cap->cap_id != le64_to_cpu(grant->cap_id));
3236 seq = cap->seq;
3237 newcaps |= cap->issued;
3238 }
3239
Sage Weila8599bd2009-10-06 11:31:12 -07003240 /* side effects now are allowed */
Sage Weil685f9a5d2009-11-09 12:05:48 -08003241 cap->cap_gen = session->s_cap_gen;
Yan, Zheng11df2df2013-11-24 14:44:38 +08003242 cap->seq = seq;
Sage Weila8599bd2009-10-06 11:31:12 -07003243
3244 __check_cap_issue(ci, cap, newcaps);
3245
Jeff Layton176c77c2019-06-06 08:06:40 -04003246 inode_set_max_iversion_raw(inode, extra_info->change_attr);
3247
Yan, Zhengf98a1282014-04-17 08:55:50 +08003248 if ((newcaps & CEPH_CAP_AUTH_SHARED) &&
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003249 (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) {
Sage Weila8599bd2009-10-06 11:31:12 -07003250 inode->i_mode = le32_to_cpu(grant->mode);
Eric W. Biederman05cb11c2013-01-31 02:56:19 -08003251 inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid));
3252 inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid));
Jeff Laytonec62b892019-05-29 12:23:14 -04003253 ci->i_btime = extra_info->btime;
Sage Weila8599bd2009-10-06 11:31:12 -07003254 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
Eric W. Biedermanbd2bae62013-01-31 04:05:39 -08003255 from_kuid(&init_user_ns, inode->i_uid),
3256 from_kgid(&init_user_ns, inode->i_gid));
Sage Weila8599bd2009-10-06 11:31:12 -07003257 }
3258
Yan, Zhengfa466742018-05-25 11:22:56 +08003259 if ((newcaps & CEPH_CAP_LINK_SHARED) &&
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003260 (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) {
Miklos Szeredibfe86842011-10-28 14:13:29 +02003261 set_nlink(inode, le32_to_cpu(grant->nlink));
Yan, Zhengca20c992013-07-21 10:07:51 +08003262 if (inode->i_nlink == 0 &&
3263 (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003264 deleted_inode = true;
Yan, Zhengca20c992013-07-21 10:07:51 +08003265 }
Sage Weila8599bd2009-10-06 11:31:12 -07003266
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003267 if ((extra_info->issued & CEPH_CAP_XATTR_EXCL) == 0 &&
3268 grant->xattr_len) {
Sage Weila8599bd2009-10-06 11:31:12 -07003269 int len = le32_to_cpu(grant->xattr_len);
3270 u64 version = le64_to_cpu(grant->xattr_version);
3271
3272 if (version > ci->i_xattrs.version) {
3273 dout(" got new xattrs v%llu on %p len %d\n",
3274 version, inode, len);
3275 if (ci->i_xattrs.blob)
3276 ceph_buffer_put(ci->i_xattrs.blob);
3277 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
3278 ci->i_xattrs.version = version;
Guangliang Zhao7221fe42013-11-11 15:18:03 +08003279 ceph_forget_all_cached_acls(inode);
Yan, Zhengac6713c2019-05-26 16:27:56 +08003280 ceph_security_invalidate_secctx(inode);
Sage Weila8599bd2009-10-06 11:31:12 -07003281 }
3282 }
3283
Yan, Zhengf98a1282014-04-17 08:55:50 +08003284 if (newcaps & CEPH_CAP_ANY_RD) {
Arnd Bergmann9bbeab42018-07-13 22:18:36 +02003285 struct timespec64 mtime, atime, ctime;
Yan, Zhengf98a1282014-04-17 08:55:50 +08003286 /* ctime/mtime/atime? */
Arnd Bergmann9bbeab42018-07-13 22:18:36 +02003287 ceph_decode_timespec64(&mtime, &grant->mtime);
3288 ceph_decode_timespec64(&atime, &grant->atime);
3289 ceph_decode_timespec64(&ctime, &grant->ctime);
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003290 ceph_fill_file_time(inode, extra_info->issued,
Yan, Zhengf98a1282014-04-17 08:55:50 +08003291 le32_to_cpu(grant->time_warp_seq),
3292 &ctime, &mtime, &atime);
3293 }
Sage Weila8599bd2009-10-06 11:31:12 -07003294
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003295 if ((newcaps & CEPH_CAP_FILE_SHARED) && extra_info->dirstat_valid) {
3296 ci->i_files = extra_info->nfiles;
3297 ci->i_subdirs = extra_info->nsubdirs;
3298 }
3299
Yan, Zhengf98a1282014-04-17 08:55:50 +08003300 if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
3301 /* file layout may have changed */
Yan, Zheng76271512016-02-03 21:24:49 +08003302 s64 old_pool = ci->i_layout.pool_id;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003303 struct ceph_string *old_ns;
3304
Yan, Zheng76271512016-02-03 21:24:49 +08003305 ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003306 old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
3307 lockdep_is_held(&ci->i_ceph_lock));
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003308 rcu_assign_pointer(ci->i_layout.pool_ns, extra_info->pool_ns);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003309
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003310 if (ci->i_layout.pool_id != old_pool ||
3311 extra_info->pool_ns != old_ns)
Yan, Zheng76271512016-02-03 21:24:49 +08003312 ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
Yan, Zheng5ea5c5e2016-02-14 18:06:41 +08003313
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003314 extra_info->pool_ns = old_ns;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003315
Yan, Zhengf98a1282014-04-17 08:55:50 +08003316 /* size/truncate_seq? */
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003317 queue_trunc = ceph_fill_file_size(inode, extra_info->issued,
Yan, Zhengf98a1282014-04-17 08:55:50 +08003318 le32_to_cpu(grant->truncate_seq),
3319 le64_to_cpu(grant->truncate_size),
3320 size);
Yan, Zheng84eea8c2017-05-16 08:55:34 +08003321 }
3322
3323 if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
3324 if (max_size != ci->i_max_size) {
Yan, Zhengf98a1282014-04-17 08:55:50 +08003325 dout("max_size %lld -> %llu\n",
3326 ci->i_max_size, max_size);
3327 ci->i_max_size = max_size;
3328 if (max_size >= ci->i_wanted_max_size) {
3329 ci->i_wanted_max_size = 0; /* reset */
3330 ci->i_requested_max_size = 0;
3331 }
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003332 wake = true;
Yan, Zheng84eea8c2017-05-16 08:55:34 +08003333 } else if (ci->i_wanted_max_size > ci->i_max_size &&
3334 ci->i_wanted_max_size > ci->i_requested_max_size) {
3335 /* CEPH_CAP_OP_IMPORT */
3336 wake = true;
Sage Weila8599bd2009-10-06 11:31:12 -07003337 }
Sage Weila8599bd2009-10-06 11:31:12 -07003338 }
3339
3340 /* check cap bits */
3341 wanted = __ceph_caps_wanted(ci);
3342 used = __ceph_caps_used(ci);
3343 dirty = __ceph_caps_dirty(ci);
3344 dout(" my wanted = %s, used = %s, dirty %s\n",
3345 ceph_cap_string(wanted),
3346 ceph_cap_string(used),
3347 ceph_cap_string(dirty));
Yan, Zhengfdac94f2018-11-22 15:26:01 +08003348
3349 if ((was_stale || le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) &&
3350 (wanted & ~(cap->mds_wanted | newcaps))) {
3351 /*
3352 * If mds is importing cap, prior cap messages that update
3353 * 'wanted' may get dropped by mds (migrate seq mismatch).
3354 *
3355 * We don't send cap message to update 'wanted' if what we
3356 * want are already issued. If mds revokes caps, cap message
3357 * that releases caps also tells mds what we want. But if
3358 * caps got revoked by mds forcedly (session stale). We may
3359 * haven't told mds what we want.
3360 */
3361 check_caps = 1;
Sage Weila8599bd2009-10-06 11:31:12 -07003362 }
3363
Sage Weila8599bd2009-10-06 11:31:12 -07003364 /* revocation, grant, or no-op? */
3365 if (cap->issued & ~newcaps) {
Sage Weil3b454c42010-06-10 13:20:33 -07003366 int revoking = cap->issued & ~newcaps;
3367
3368 dout("revocation: %s -> %s (revoking %s)\n",
3369 ceph_cap_string(cap->issued),
3370 ceph_cap_string(newcaps),
3371 ceph_cap_string(revoking));
Yan, Zheng525d15e2019-05-11 17:27:59 +08003372 if (S_ISREG(inode->i_mode) &&
3373 (revoking & used & CEPH_CAP_FILE_BUFFER))
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003374 writeback = true; /* initiate writeback; will delay ack */
Yan, Zheng525d15e2019-05-11 17:27:59 +08003375 else if (queue_invalidate &&
3376 revoking == CEPH_CAP_FILE_CACHE &&
3377 (newcaps & CEPH_CAP_FILE_LAZYIO) == 0)
Sage Weil3b454c42010-06-10 13:20:33 -07003378 ; /* do nothing yet, invalidation will be queued */
3379 else if (cap == ci->i_auth_cap)
3380 check_caps = 1; /* check auth cap only */
3381 else
3382 check_caps = 2; /* check all caps */
Sage Weila8599bd2009-10-06 11:31:12 -07003383 cap->issued = newcaps;
Sage Weil978097c2010-03-08 15:27:53 -08003384 cap->implemented |= newcaps;
Sage Weila8599bd2009-10-06 11:31:12 -07003385 } else if (cap->issued == newcaps) {
3386 dout("caps unchanged: %s -> %s\n",
3387 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
3388 } else {
3389 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
3390 ceph_cap_string(newcaps));
Yan, Zheng6ee6b9532013-07-02 12:40:21 +08003391 /* non-auth MDS is revoking the newly grant caps ? */
3392 if (cap == ci->i_auth_cap &&
3393 __ceph_caps_revoking_other(ci, cap, newcaps))
3394 check_caps = 2;
3395
Sage Weila8599bd2009-10-06 11:31:12 -07003396 cap->issued = newcaps;
3397 cap->implemented |= newcaps; /* add bits only, to
3398 * avoid stepping on a
3399 * pending revocation */
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003400 wake = true;
Sage Weila8599bd2009-10-06 11:31:12 -07003401 }
Sage Weil978097c2010-03-08 15:27:53 -08003402 BUG_ON(cap->issued & ~cap->implemented);
Sage Weila8599bd2009-10-06 11:31:12 -07003403
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003404 if (extra_info->inline_version > 0 &&
3405 extra_info->inline_version >= ci->i_inline_version) {
3406 ci->i_inline_version = extra_info->inline_version;
Yan, Zheng31c542a2014-11-14 21:41:55 +08003407 if (ci->i_inline_version != CEPH_INLINE_NONE &&
3408 (newcaps & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)))
3409 fill_inline = true;
3410 }
3411
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003412 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003413 if (newcaps & ~extra_info->issued)
Fabian Frederickab6c2c32014-10-09 23:16:35 +02003414 wake = true;
Jeff Laytone8a4d262020-02-25 11:49:53 -08003415 ceph_kick_flushing_inode_caps(session, ci);
3416 spin_unlock(&ci->i_ceph_lock);
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003417 up_read(&session->s_mdsc->snap_rwsem);
Yan, Zheng0e294382016-07-04 18:06:41 +08003418 } else {
3419 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003420 }
3421
Yan, Zheng31c542a2014-11-14 21:41:55 +08003422 if (fill_inline)
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003423 ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
3424 extra_info->inline_len);
Yan, Zheng31c542a2014-11-14 21:41:55 +08003425
Yan, Zheng14649752016-05-20 15:41:20 +08003426 if (queue_trunc)
Yan, Zhengc6bcda62014-04-11 10:18:07 +08003427 ceph_queue_vmtruncate(inode);
Yan, Zhengc6bcda62014-04-11 10:18:07 +08003428
Sage Weil3c6f6b72010-02-09 15:24:44 -08003429 if (writeback)
Sage Weila8599bd2009-10-06 11:31:12 -07003430 /*
3431 * queue inode for writeback: we can't actually call
3432 * filemap_write_and_wait, etc. from message handler
3433 * context.
3434 */
Sage Weil3c6f6b72010-02-09 15:24:44 -08003435 ceph_queue_writeback(inode);
3436 if (queue_invalidate)
3437 ceph_queue_invalidate(inode);
Yan, Zhengca20c992013-07-21 10:07:51 +08003438 if (deleted_inode)
3439 invalidate_aliases(inode);
Sage Weila8599bd2009-10-06 11:31:12 -07003440 if (wake)
Yehuda Sadeh03066f22010-07-27 13:11:08 -07003441 wake_up_all(&ci->i_cap_wq);
Sage Weil15637c82010-03-16 13:42:00 -07003442
3443 if (check_caps == 1)
Yan, Zhenga0d93e32020-03-05 20:21:01 +08003444 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY | CHECK_CAPS_NOINVAL,
Sage Weil15637c82010-03-16 13:42:00 -07003445 session);
3446 else if (check_caps == 2)
Yan, Zhenga0d93e32020-03-05 20:21:01 +08003447 ceph_check_caps(ci, CHECK_CAPS_NOINVAL, session);
Sage Weil15637c82010-03-16 13:42:00 -07003448 else
3449 mutex_unlock(&session->s_mutex);
Sage Weila8599bd2009-10-06 11:31:12 -07003450}
3451
3452/*
3453 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
3454 * MDS has been safely committed.
3455 */
Sage Weil6df058c2009-12-22 11:24:33 -08003456static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
Sage Weila8599bd2009-10-06 11:31:12 -07003457 struct ceph_mds_caps *m,
3458 struct ceph_mds_session *session,
3459 struct ceph_cap *cap)
Sage Weilbe655592011-11-30 09:47:09 -08003460 __releases(ci->i_ceph_lock)
Sage Weila8599bd2009-10-06 11:31:12 -07003461{
3462 struct ceph_inode_info *ci = ceph_inode(inode);
Yehuda Sadeh3d14c5d2010-04-06 15:14:15 -07003463 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
Yan, Zhenge4500b52016-07-06 11:12:56 +08003464 struct ceph_cap_flush *cf, *tmp_cf;
Yan, Zheng553adfd2015-06-09 15:48:57 +08003465 LIST_HEAD(to_remove);
Sage Weila8599bd2009-10-06 11:31:12 -07003466 unsigned seq = le32_to_cpu(m->seq);
3467 int dirty = le32_to_cpu(m->dirty);
3468 int cleaned = 0;
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003469 bool drop = false;
Thomas Meyer7271efa2017-10-07 16:02:21 +02003470 bool wake_ci = false;
3471 bool wake_mdsc = false;
Sage Weila8599bd2009-10-06 11:31:12 -07003472
Yan, Zhenge4500b52016-07-06 11:12:56 +08003473 list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
Jeff Laytond7dbfb42020-03-18 15:34:20 -04003474 /* Is this the one that was flushed? */
Yan, Zheng553adfd2015-06-09 15:48:57 +08003475 if (cf->tid == flush_tid)
3476 cleaned = cf->caps;
Jeff Laytond7dbfb42020-03-18 15:34:20 -04003477
3478 /* Is this a capsnap? */
3479 if (cf->caps == 0)
Yan, Zheng0e294382016-07-04 18:06:41 +08003480 continue;
Jeff Laytond7dbfb42020-03-18 15:34:20 -04003481
Yan, Zheng553adfd2015-06-09 15:48:57 +08003482 if (cf->tid <= flush_tid) {
Jeff Laytond7dbfb42020-03-18 15:34:20 -04003483 /*
3484 * An earlier or current tid. The FLUSH_ACK should
3485 * represent a superset of this flush's caps.
3486 */
Jeff Layton681ac632020-03-18 15:29:34 -04003487 wake_ci |= __detach_cap_flush_from_ci(ci, cf);
Yan, Zhenge4500b52016-07-06 11:12:56 +08003488 list_add_tail(&cf->i_list, &to_remove);
Yan, Zheng553adfd2015-06-09 15:48:57 +08003489 } else {
Jeff Laytond7dbfb42020-03-18 15:34:20 -04003490 /*
3491 * This is a later one. Any caps in it are still dirty
3492 * so don't count them as cleaned.
3493 */
Yan, Zheng553adfd2015-06-09 15:48:57 +08003494 cleaned &= ~cf->caps;
3495 if (!cleaned)
3496 break;
3497 }
3498 }
Sage Weila8599bd2009-10-06 11:31:12 -07003499
3500 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
3501 " flushing %s -> %s\n",
3502 inode, session->s_mds, seq, ceph_cap_string(dirty),
3503 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
3504 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
3505
Yan, Zheng8310b082015-06-09 17:20:12 +08003506 if (list_empty(&to_remove) && !cleaned)
Sage Weila8599bd2009-10-06 11:31:12 -07003507 goto out;
3508
Sage Weila8599bd2009-10-06 11:31:12 -07003509 ci->i_flushing_caps &= ~cleaned;
Sage Weila8599bd2009-10-06 11:31:12 -07003510
3511 spin_lock(&mdsc->cap_dirty_lock);
Yan, Zheng8310b082015-06-09 17:20:12 +08003512
Jeff Layton681ac632020-03-18 15:29:34 -04003513 list_for_each_entry(cf, &to_remove, i_list)
3514 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc, cf);
Yan, Zheng8310b082015-06-09 17:20:12 +08003515
Sage Weila8599bd2009-10-06 11:31:12 -07003516 if (ci->i_flushing_caps == 0) {
Yan, Zheng0e294382016-07-04 18:06:41 +08003517 if (list_empty(&ci->i_cap_flush_list)) {
3518 list_del_init(&ci->i_flushing_item);
3519 if (!list_empty(&session->s_cap_flushing)) {
3520 dout(" mds%d still flushing cap on %p\n",
3521 session->s_mds,
3522 &list_first_entry(&session->s_cap_flushing,
3523 struct ceph_inode_info,
3524 i_flushing_item)->vfs_inode);
3525 }
3526 }
Sage Weila8599bd2009-10-06 11:31:12 -07003527 mdsc->num_cap_flushing--;
Sage Weila8599bd2009-10-06 11:31:12 -07003528 dout(" inode %p now !flushing\n", inode);
Sage Weilafcdaea2009-10-14 14:27:38 -07003529
3530 if (ci->i_dirty_caps == 0) {
3531 dout(" inode %p now clean\n", inode);
3532 BUG_ON(!list_empty(&ci->i_dirty_item));
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003533 drop = true;
Yan, Zheng5dda377c2015-04-30 14:40:54 +08003534 if (ci->i_wr_ref == 0 &&
3535 ci->i_wrbuffer_ref_head == 0) {
Sage Weil7d8cb262010-08-24 08:44:16 -07003536 BUG_ON(!ci->i_head_snapc);
3537 ceph_put_snap_context(ci->i_head_snapc);
3538 ci->i_head_snapc = NULL;
3539 }
Sage Weil76e3b392009-10-15 18:13:53 -07003540 } else {
3541 BUG_ON(list_empty(&ci->i_dirty_item));
Sage Weilafcdaea2009-10-14 14:27:38 -07003542 }
Sage Weila8599bd2009-10-06 11:31:12 -07003543 }
3544 spin_unlock(&mdsc->cap_dirty_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003545
3546out:
Sage Weilbe655592011-11-30 09:47:09 -08003547 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng553adfd2015-06-09 15:48:57 +08003548
3549 while (!list_empty(&to_remove)) {
3550 cf = list_first_entry(&to_remove,
Yan, Zhenge4500b52016-07-06 11:12:56 +08003551 struct ceph_cap_flush, i_list);
3552 list_del(&cf->i_list);
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08003553 ceph_free_cap_flush(cf);
Yan, Zheng553adfd2015-06-09 15:48:57 +08003554 }
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003555
3556 if (wake_ci)
3557 wake_up_all(&ci->i_cap_wq);
3558 if (wake_mdsc)
3559 wake_up_all(&mdsc->cap_flushing_wq);
Sage Weilafcdaea2009-10-14 14:27:38 -07003560 if (drop)
Sage Weila8599bd2009-10-06 11:31:12 -07003561 iput(inode);
3562}
3563
3564/*
3565 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
3566 * throw away our cap_snap.
3567 *
3568 * Caller hold s_mutex.
3569 */
Sage Weil6df058c2009-12-22 11:24:33 -08003570static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
Sage Weila8599bd2009-10-06 11:31:12 -07003571 struct ceph_mds_caps *m,
3572 struct ceph_mds_session *session)
3573{
3574 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zhengaffbc192015-05-05 21:22:13 +08003575 struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07003576 u64 follows = le64_to_cpu(m->snap_follows);
Sage Weila8599bd2009-10-06 11:31:12 -07003577 struct ceph_cap_snap *capsnap;
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003578 bool flushed = false;
3579 bool wake_ci = false;
3580 bool wake_mdsc = false;
Sage Weila8599bd2009-10-06 11:31:12 -07003581
3582 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
3583 inode, ci, session->s_mds, follows);
3584
Sage Weilbe655592011-11-30 09:47:09 -08003585 spin_lock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003586 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
3587 if (capsnap->follows == follows) {
Yan, Zheng0e294382016-07-04 18:06:41 +08003588 if (capsnap->cap_flush.tid != flush_tid) {
Sage Weila8599bd2009-10-06 11:31:12 -07003589 dout(" cap_snap %p follows %lld tid %lld !="
3590 " %lld\n", capsnap, follows,
Yan, Zheng0e294382016-07-04 18:06:41 +08003591 flush_tid, capsnap->cap_flush.tid);
Sage Weila8599bd2009-10-06 11:31:12 -07003592 break;
3593 }
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003594 flushed = true;
Sage Weila8599bd2009-10-06 11:31:12 -07003595 break;
3596 } else {
3597 dout(" skipping cap_snap %p follows %lld\n",
3598 capsnap, capsnap->follows);
3599 }
3600 }
Yan, Zheng0e294382016-07-04 18:06:41 +08003601 if (flushed) {
Yan, Zheng0e294382016-07-04 18:06:41 +08003602 WARN_ON(capsnap->dirty_pages || capsnap->writing);
3603 dout(" removing %p cap_snap %p follows %lld\n",
3604 inode, capsnap, follows);
3605 list_del(&capsnap->ci_item);
Jeff Layton681ac632020-03-18 15:29:34 -04003606 wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
Yan, Zheng0e294382016-07-04 18:06:41 +08003607
3608 spin_lock(&mdsc->cap_dirty_lock);
3609
3610 if (list_empty(&ci->i_cap_flush_list))
3611 list_del_init(&ci->i_flushing_item);
3612
Jeff Layton681ac632020-03-18 15:29:34 -04003613 wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
3614 &capsnap->cap_flush);
Yan, Zheng0e294382016-07-04 18:06:41 +08003615 spin_unlock(&mdsc->cap_dirty_lock);
Yan, Zheng0e294382016-07-04 18:06:41 +08003616 }
Sage Weilbe655592011-11-30 09:47:09 -08003617 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng0e294382016-07-04 18:06:41 +08003618 if (flushed) {
3619 ceph_put_snap_context(capsnap->context);
3620 ceph_put_cap_snap(capsnap);
Yan, Zhengc8799fc2016-07-07 15:22:38 +08003621 if (wake_ci)
3622 wake_up_all(&ci->i_cap_wq);
3623 if (wake_mdsc)
3624 wake_up_all(&mdsc->cap_flushing_wq);
Sage Weila8599bd2009-10-06 11:31:12 -07003625 iput(inode);
Yan, Zheng0e294382016-07-04 18:06:41 +08003626 }
Sage Weila8599bd2009-10-06 11:31:12 -07003627}
3628
3629/*
3630 * Handle TRUNC from MDS, indicating file truncation.
3631 *
3632 * caller hold s_mutex.
3633 */
Jeff Layton7391fba2020-03-18 16:43:30 -04003634static bool handle_cap_trunc(struct inode *inode,
Sage Weila8599bd2009-10-06 11:31:12 -07003635 struct ceph_mds_caps *trunc,
3636 struct ceph_mds_session *session)
Sage Weila8599bd2009-10-06 11:31:12 -07003637{
3638 struct ceph_inode_info *ci = ceph_inode(inode);
3639 int mds = session->s_mds;
3640 int seq = le32_to_cpu(trunc->seq);
3641 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
3642 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
3643 u64 size = le64_to_cpu(trunc->size);
3644 int implemented = 0;
3645 int dirty = __ceph_caps_dirty(ci);
3646 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
Jeff Layton7391fba2020-03-18 16:43:30 -04003647 bool queue_trunc = false;
3648
3649 lockdep_assert_held(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07003650
3651 issued |= implemented | dirty;
3652
3653 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
3654 inode, mds, seq, truncate_size, truncate_seq);
3655 queue_trunc = ceph_fill_file_size(inode, issued,
3656 truncate_seq, truncate_size, size);
Jeff Layton7391fba2020-03-18 16:43:30 -04003657 return queue_trunc;
Sage Weila8599bd2009-10-06 11:31:12 -07003658}
3659
3660/*
3661 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
3662 * different one. If we are the most recent migration we've seen (as
3663 * indicated by mseq), make note of the migrating cap bits for the
3664 * duration (until we see the corresponding IMPORT).
3665 *
3666 * caller holds s_mutex
3667 */
3668static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
Yan, Zheng11df2df2013-11-24 14:44:38 +08003669 struct ceph_mds_cap_peer *ph,
3670 struct ceph_mds_session *session)
Sage Weila8599bd2009-10-06 11:31:12 -07003671{
Sage Weildb354052011-05-24 11:46:31 -07003672 struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
Yan, Zheng11df2df2013-11-24 14:44:38 +08003673 struct ceph_mds_session *tsession = NULL;
Yan, Zhengd9df2782014-04-18 09:57:11 +08003674 struct ceph_cap *cap, *tcap, *new_cap = NULL;
Sage Weila8599bd2009-10-06 11:31:12 -07003675 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003676 u64 t_cap_id;
Sage Weila8599bd2009-10-06 11:31:12 -07003677 unsigned mseq = le32_to_cpu(ex->migrate_seq);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003678 unsigned t_seq, t_mseq;
3679 int target, issued;
3680 int mds = session->s_mds;
Sage Weila8599bd2009-10-06 11:31:12 -07003681
Yan, Zheng11df2df2013-11-24 14:44:38 +08003682 if (ph) {
3683 t_cap_id = le64_to_cpu(ph->cap_id);
3684 t_seq = le32_to_cpu(ph->seq);
3685 t_mseq = le32_to_cpu(ph->mseq);
3686 target = le32_to_cpu(ph->mds);
3687 } else {
3688 t_cap_id = t_seq = t_mseq = 0;
3689 target = -1;
Sage Weila8599bd2009-10-06 11:31:12 -07003690 }
3691
Yan, Zheng11df2df2013-11-24 14:44:38 +08003692 dout("handle_cap_export inode %p ci %p mds%d mseq %d target %d\n",
3693 inode, ci, mds, mseq, target);
3694retry:
3695 spin_lock(&ci->i_ceph_lock);
3696 cap = __get_cap_for_mds(ci, mds);
Yan, Zhengca665e02014-04-21 15:46:37 +08003697 if (!cap || cap->cap_id != le64_to_cpu(ex->cap_id))
Yan, Zheng11df2df2013-11-24 14:44:38 +08003698 goto out_unlock;
Sage Weil154f42c2010-06-21 13:45:04 -07003699
Yan, Zheng11df2df2013-11-24 14:44:38 +08003700 if (target < 0) {
Yan, Zhengd2f8bb22018-12-10 16:35:09 +08003701 __ceph_remove_cap(cap, false);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003702 goto out_unlock;
3703 }
Sage Weildb354052011-05-24 11:46:31 -07003704
Yan, Zheng11df2df2013-11-24 14:44:38 +08003705 /*
3706 * now we know we haven't received the cap import message yet
3707 * because the exported cap still exist.
3708 */
3709
3710 issued = cap->issued;
Yan, Zhengd84b37f2018-01-03 11:16:27 +08003711 if (issued != cap->implemented)
3712 pr_err_ratelimited("handle_cap_export: issued != implemented: "
3713 "ino (%llx.%llx) mds%d seq %d mseq %d "
3714 "issued %s implemented %s\n",
3715 ceph_vinop(inode), mds, cap->seq, cap->mseq,
3716 ceph_cap_string(issued),
3717 ceph_cap_string(cap->implemented));
3718
Yan, Zheng11df2df2013-11-24 14:44:38 +08003719
3720 tcap = __get_cap_for_mds(ci, target);
3721 if (tcap) {
3722 /* already have caps from the target */
Yan, Zhengfa0aa3b2017-08-28 15:07:42 +08003723 if (tcap->cap_id == t_cap_id &&
Yan, Zheng11df2df2013-11-24 14:44:38 +08003724 ceph_seq_cmp(tcap->seq, t_seq) < 0) {
3725 dout(" updating import cap %p mds%d\n", tcap, target);
3726 tcap->cap_id = t_cap_id;
3727 tcap->seq = t_seq - 1;
3728 tcap->issue_seq = t_seq - 1;
Yan, Zheng11df2df2013-11-24 14:44:38 +08003729 tcap->issued |= issued;
3730 tcap->implemented |= issued;
3731 if (cap == ci->i_auth_cap)
3732 ci->i_auth_cap = tcap;
Yan, Zheng00f06cb2017-01-24 10:02:32 +08003733
Yan, Zheng0e294382016-07-04 18:06:41 +08003734 if (!list_empty(&ci->i_cap_flush_list) &&
3735 ci->i_auth_cap == tcap) {
Yan, Zheng11df2df2013-11-24 14:44:38 +08003736 spin_lock(&mdsc->cap_dirty_lock);
3737 list_move_tail(&ci->i_flushing_item,
3738 &tcap->session->s_cap_flushing);
3739 spin_unlock(&mdsc->cap_dirty_lock);
Sage Weildb354052011-05-24 11:46:31 -07003740 }
Sage Weila8599bd2009-10-06 11:31:12 -07003741 }
Yan, Zhenga096b092013-09-22 10:15:58 +08003742 __ceph_remove_cap(cap, false);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003743 goto out_unlock;
Yan, Zhengd9df2782014-04-18 09:57:11 +08003744 } else if (tsession) {
Yan, Zheng11df2df2013-11-24 14:44:38 +08003745 /* add placeholder for the export tagert */
Yan, Zhengd9df2782014-04-18 09:57:11 +08003746 int flag = (cap == ci->i_auth_cap) ? CEPH_CAP_FLAG_AUTH : 0;
Yan, Zheng00f06cb2017-01-24 10:02:32 +08003747 tcap = new_cap;
Yan, Zheng135e6712020-03-05 20:21:02 +08003748 ceph_add_cap(inode, tsession, t_cap_id, issued, 0,
Yan, Zhengd9df2782014-04-18 09:57:11 +08003749 t_seq - 1, t_mseq, (u64)-1, flag, &new_cap);
3750
Yan, Zheng00f06cb2017-01-24 10:02:32 +08003751 if (!list_empty(&ci->i_cap_flush_list) &&
3752 ci->i_auth_cap == tcap) {
3753 spin_lock(&mdsc->cap_dirty_lock);
3754 list_move_tail(&ci->i_flushing_item,
3755 &tcap->session->s_cap_flushing);
3756 spin_unlock(&mdsc->cap_dirty_lock);
3757 }
3758
Yan, Zhengd9df2782014-04-18 09:57:11 +08003759 __ceph_remove_cap(cap, false);
3760 goto out_unlock;
Yan, Zheng11df2df2013-11-24 14:44:38 +08003761 }
Sage Weila8599bd2009-10-06 11:31:12 -07003762
Sage Weilbe655592011-11-30 09:47:09 -08003763 spin_unlock(&ci->i_ceph_lock);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003764 mutex_unlock(&session->s_mutex);
3765
3766 /* open target session */
3767 tsession = ceph_mdsc_open_export_target_session(mdsc, target);
3768 if (!IS_ERR(tsession)) {
3769 if (mds > target) {
3770 mutex_lock(&session->s_mutex);
3771 mutex_lock_nested(&tsession->s_mutex,
3772 SINGLE_DEPTH_NESTING);
3773 } else {
3774 mutex_lock(&tsession->s_mutex);
3775 mutex_lock_nested(&session->s_mutex,
3776 SINGLE_DEPTH_NESTING);
3777 }
Yan, Zhengd9df2782014-04-18 09:57:11 +08003778 new_cap = ceph_get_cap(mdsc, NULL);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003779 } else {
3780 WARN_ON(1);
3781 tsession = NULL;
3782 target = -1;
Wu Bo4d8e28f2020-04-30 14:12:49 +08003783 mutex_lock(&session->s_mutex);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003784 }
3785 goto retry;
3786
3787out_unlock:
3788 spin_unlock(&ci->i_ceph_lock);
3789 mutex_unlock(&session->s_mutex);
3790 if (tsession) {
3791 mutex_unlock(&tsession->s_mutex);
3792 ceph_put_mds_session(tsession);
3793 }
Yan, Zhengd9df2782014-04-18 09:57:11 +08003794 if (new_cap)
3795 ceph_put_cap(mdsc, new_cap);
Sage Weila8599bd2009-10-06 11:31:12 -07003796}
3797
3798/*
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003799 * Handle cap IMPORT.
Sage Weila8599bd2009-10-06 11:31:12 -07003800 *
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003801 * caller holds s_mutex. acquires i_ceph_lock
Sage Weila8599bd2009-10-06 11:31:12 -07003802 */
3803static void handle_cap_import(struct ceph_mds_client *mdsc,
3804 struct inode *inode, struct ceph_mds_caps *im,
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003805 struct ceph_mds_cap_peer *ph,
Sage Weila8599bd2009-10-06 11:31:12 -07003806 struct ceph_mds_session *session,
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003807 struct ceph_cap **target_cap, int *old_issued)
3808 __acquires(ci->i_ceph_lock)
Sage Weila8599bd2009-10-06 11:31:12 -07003809{
3810 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003811 struct ceph_cap *cap, *ocap, *new_cap = NULL;
Sage Weila8599bd2009-10-06 11:31:12 -07003812 int mds = session->s_mds;
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003813 int issued;
3814 unsigned caps = le32_to_cpu(im->caps);
Sage Weila8599bd2009-10-06 11:31:12 -07003815 unsigned wanted = le32_to_cpu(im->wanted);
3816 unsigned seq = le32_to_cpu(im->seq);
3817 unsigned mseq = le32_to_cpu(im->migrate_seq);
3818 u64 realmino = le64_to_cpu(im->realm);
3819 u64 cap_id = le64_to_cpu(im->cap_id);
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003820 u64 p_cap_id;
3821 int peer;
Sage Weila8599bd2009-10-06 11:31:12 -07003822
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003823 if (ph) {
3824 p_cap_id = le64_to_cpu(ph->cap_id);
3825 peer = le32_to_cpu(ph->mds);
Sage Weila8599bd2009-10-06 11:31:12 -07003826 } else {
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003827 p_cap_id = 0;
3828 peer = -1;
Sage Weila8599bd2009-10-06 11:31:12 -07003829 }
3830
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003831 dout("handle_cap_import inode %p ci %p mds%d mseq %d peer %d\n",
3832 inode, ci, mds, mseq, peer);
3833
Yan, Zhengd9df2782014-04-18 09:57:11 +08003834retry:
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003835 spin_lock(&ci->i_ceph_lock);
Yan, Zhengd9df2782014-04-18 09:57:11 +08003836 cap = __get_cap_for_mds(ci, mds);
3837 if (!cap) {
3838 if (!new_cap) {
3839 spin_unlock(&ci->i_ceph_lock);
3840 new_cap = ceph_get_cap(mdsc, NULL);
3841 goto retry;
3842 }
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003843 cap = new_cap;
3844 } else {
3845 if (new_cap) {
3846 ceph_put_cap(mdsc, new_cap);
3847 new_cap = NULL;
3848 }
Yan, Zhengd9df2782014-04-18 09:57:11 +08003849 }
3850
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003851 __ceph_caps_issued(ci, &issued);
3852 issued |= __ceph_caps_dirty(ci);
3853
Yan, Zheng135e6712020-03-05 20:21:02 +08003854 ceph_add_cap(inode, session, cap_id, caps, wanted, seq, mseq,
Yan, Zhengd9df2782014-04-18 09:57:11 +08003855 realmino, CEPH_CAP_FLAG_AUTH, &new_cap);
3856
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003857 ocap = peer >= 0 ? __get_cap_for_mds(ci, peer) : NULL;
3858 if (ocap && ocap->cap_id == p_cap_id) {
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003859 dout(" remove export cap %p mds%d flags %d\n",
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003860 ocap, peer, ph->flags);
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003861 if ((ph->flags & CEPH_CAP_FLAG_AUTH) &&
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003862 (ocap->seq != le32_to_cpu(ph->seq) ||
3863 ocap->mseq != le32_to_cpu(ph->mseq))) {
Yan, Zhengd84b37f2018-01-03 11:16:27 +08003864 pr_err_ratelimited("handle_cap_import: "
3865 "mismatched seq/mseq: ino (%llx.%llx) "
3866 "mds%d seq %d mseq %d importer mds%d "
3867 "has peer seq %d mseq %d\n",
3868 ceph_vinop(inode), peer, ocap->seq,
3869 ocap->mseq, mds, le32_to_cpu(ph->seq),
3870 le32_to_cpu(ph->mseq));
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003871 }
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003872 __ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003873 }
3874
3875 /* make sure we re-request max_size, if necessary */
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003876 ci->i_requested_max_size = 0;
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003877
Yan, Zheng2cd698b2014-04-18 13:20:27 +08003878 *old_issued = issued;
3879 *target_cap = cap;
Sage Weila8599bd2009-10-06 11:31:12 -07003880}
3881
3882/*
3883 * Handle a caps message from the MDS.
3884 *
3885 * Identify the appropriate session, inode, and call the right handler
3886 * based on the cap op.
3887 */
3888void ceph_handle_caps(struct ceph_mds_session *session,
3889 struct ceph_msg *msg)
3890{
3891 struct ceph_mds_client *mdsc = session->s_mdsc;
Sage Weila8599bd2009-10-06 11:31:12 -07003892 struct inode *inode;
Sage Weilbe655592011-11-30 09:47:09 -08003893 struct ceph_inode_info *ci;
Sage Weila8599bd2009-10-06 11:31:12 -07003894 struct ceph_cap *cap;
3895 struct ceph_mds_caps *h;
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003896 struct ceph_mds_cap_peer *peer = NULL;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003897 struct ceph_snap_realm *realm = NULL;
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003898 int op;
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003899 int msg_version = le16_to_cpu(msg->hdr.version);
Sage Weil3d7ded42010-06-09 16:47:10 -07003900 u32 seq, mseq;
Sage Weila8599bd2009-10-06 11:31:12 -07003901 struct ceph_vino vino;
Sage Weil70edb552010-03-01 13:20:50 -08003902 void *snaptrace;
Sage Weilce1fbc82010-08-02 15:09:39 -07003903 size_t snaptrace_len;
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003904 void *p, *end;
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003905 struct cap_extra_info extra_info = {};
Jeff Layton7391fba2020-03-18 16:43:30 -04003906 bool queue_trunc;
Sage Weila8599bd2009-10-06 11:31:12 -07003907
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003908 dout("handle_caps from mds%d\n", session->s_mds);
Sage Weila8599bd2009-10-06 11:31:12 -07003909
3910 /* decode */
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003911 end = msg->front.iov_base + msg->front.iov_len;
Sage Weila8599bd2009-10-06 11:31:12 -07003912 if (msg->front.iov_len < sizeof(*h))
3913 goto bad;
3914 h = msg->front.iov_base;
3915 op = le32_to_cpu(h->op);
3916 vino.ino = le64_to_cpu(h->ino);
3917 vino.snap = CEPH_NOSNAP;
Sage Weila8599bd2009-10-06 11:31:12 -07003918 seq = le32_to_cpu(h->seq);
Sage Weil3d7ded42010-06-09 16:47:10 -07003919 mseq = le32_to_cpu(h->migrate_seq);
Sage Weila8599bd2009-10-06 11:31:12 -07003920
Sage Weilce1fbc82010-08-02 15:09:39 -07003921 snaptrace = h + 1;
3922 snaptrace_len = le32_to_cpu(h->snap_trace_len);
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003923 p = snaptrace + snaptrace_len;
Sage Weilce1fbc82010-08-02 15:09:39 -07003924
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003925 if (msg_version >= 2) {
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003926 u32 flock_len;
Sage Weilce1fbc82010-08-02 15:09:39 -07003927 ceph_decode_32_safe(&p, end, flock_len, bad);
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003928 if (p + flock_len > end)
3929 goto bad;
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003930 p += flock_len;
Sage Weilce1fbc82010-08-02 15:09:39 -07003931 }
3932
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003933 if (msg_version >= 3) {
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003934 if (op == CEPH_CAP_OP_IMPORT) {
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003935 if (p + sizeof(*peer) > end)
3936 goto bad;
3937 peer = p;
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003938 p += sizeof(*peer);
Yan, Zheng11df2df2013-11-24 14:44:38 +08003939 } else if (op == CEPH_CAP_OP_EXPORT) {
3940 /* recorded in unused fields */
3941 peer = (void *)&h->size;
Yan, Zheng4ee6a912013-11-24 14:43:46 +08003942 }
3943 }
3944
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003945 if (msg_version >= 4) {
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003946 ceph_decode_64_safe(&p, end, extra_info.inline_version, bad);
3947 ceph_decode_32_safe(&p, end, extra_info.inline_len, bad);
3948 if (p + extra_info.inline_len > end)
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003949 goto bad;
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003950 extra_info.inline_data = p;
3951 p += extra_info.inline_len;
Yan, Zhengfb01d1f2014-11-14 21:29:55 +08003952 }
3953
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003954 if (msg_version >= 5) {
Jeff Layton92475f02017-04-13 11:07:04 -04003955 struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc;
3956 u32 epoch_barrier;
3957
3958 ceph_decode_32_safe(&p, end, epoch_barrier, bad);
3959 ceph_osdc_update_epoch_barrier(osdc, epoch_barrier);
3960 }
3961
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003962 if (msg_version >= 8) {
Yan, Zheng5ea5c5e2016-02-14 18:06:41 +08003963 u64 flush_tid;
3964 u32 caller_uid, caller_gid;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003965 u32 pool_ns_len;
Jeff Layton92475f02017-04-13 11:07:04 -04003966
Yan, Zheng5ea5c5e2016-02-14 18:06:41 +08003967 /* version >= 6 */
3968 ceph_decode_64_safe(&p, end, flush_tid, bad);
3969 /* version >= 7 */
3970 ceph_decode_32_safe(&p, end, caller_uid, bad);
3971 ceph_decode_32_safe(&p, end, caller_gid, bad);
3972 /* version >= 8 */
3973 ceph_decode_32_safe(&p, end, pool_ns_len, bad);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003974 if (pool_ns_len > 0) {
3975 ceph_decode_need(&p, end, pool_ns_len, bad);
Yan, Zhenga1c6b832018-04-27 10:29:44 +08003976 extra_info.pool_ns =
3977 ceph_find_or_create_string(p, pool_ns_len);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08003978 p += pool_ns_len;
3979 }
Yan, Zheng5ea5c5e2016-02-14 18:06:41 +08003980 }
3981
Jeff Laytonec62b892019-05-29 12:23:14 -04003982 if (msg_version >= 9) {
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003983 struct ceph_timespec *btime;
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003984
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003985 if (p + sizeof(*btime) > end)
3986 goto bad;
3987 btime = p;
Jeff Laytonec62b892019-05-29 12:23:14 -04003988 ceph_decode_timespec64(&extra_info.btime, btime);
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003989 p += sizeof(*btime);
Jeff Layton176c77c2019-06-06 08:06:40 -04003990 ceph_decode_64_safe(&p, end, extra_info.change_attr, bad);
Jeff Laytonec62b892019-05-29 12:23:14 -04003991 }
3992
3993 if (msg_version >= 11) {
3994 u32 flags;
Yan, Zheng4985d6f2018-04-27 11:11:31 +08003995 /* version >= 10 */
3996 ceph_decode_32_safe(&p, end, flags, bad);
3997 /* version >= 11 */
3998 extra_info.dirstat_valid = true;
3999 ceph_decode_64_safe(&p, end, extra_info.nfiles, bad);
4000 ceph_decode_64_safe(&p, end, extra_info.nsubdirs, bad);
4001 }
4002
Yan, Zheng6cd3bca2014-09-17 07:45:12 +08004003 /* lookup ino */
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004004 inode = ceph_find_inode(mdsc->fsc->sb, vino);
Yan, Zheng6cd3bca2014-09-17 07:45:12 +08004005 ci = ceph_inode(inode);
4006 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
4007 vino.snap, inode);
4008
Sage Weila8599bd2009-10-06 11:31:12 -07004009 mutex_lock(&session->s_mutex);
4010 session->s_seq++;
4011 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
4012 (unsigned)seq);
4013
Sage Weila8599bd2009-10-06 11:31:12 -07004014 if (!inode) {
4015 dout(" i don't have ino %llx\n", vino.ino);
Sage Weil3d7ded42010-06-09 16:47:10 -07004016
Yan, Zhenga096b092013-09-22 10:15:58 +08004017 if (op == CEPH_CAP_OP_IMPORT) {
Yan, Zheng745a8e32015-05-14 17:22:42 +08004018 cap = ceph_get_cap(mdsc, NULL);
4019 cap->cap_ino = vino.ino;
4020 cap->queue_release = 1;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08004021 cap->cap_id = le64_to_cpu(h->cap_id);
Yan, Zheng745a8e32015-05-14 17:22:42 +08004022 cap->mseq = mseq;
4023 cap->seq = seq;
Yan, Zhengdc24de82016-11-17 19:55:30 +08004024 cap->issue_seq = seq;
Yan, Zhenga096b092013-09-22 10:15:58 +08004025 spin_lock(&session->s_cap_lock);
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08004026 __ceph_queue_cap_release(session, cap);
Yan, Zhenga096b092013-09-22 10:15:58 +08004027 spin_unlock(&session->s_cap_lock);
4028 }
Jeff Laytonfb33c112020-05-20 10:36:07 -04004029 goto flush_cap_releases;
Sage Weila8599bd2009-10-06 11:31:12 -07004030 }
4031
4032 /* these will work even if we don't have a cap yet */
4033 switch (op) {
4034 case CEPH_CAP_OP_FLUSHSNAP_ACK:
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004035 handle_cap_flushsnap_ack(inode, le64_to_cpu(msg->hdr.tid),
4036 h, session);
Sage Weila8599bd2009-10-06 11:31:12 -07004037 goto done;
4038
4039 case CEPH_CAP_OP_EXPORT:
Yan, Zheng11df2df2013-11-24 14:44:38 +08004040 handle_cap_export(inode, h, peer, session);
4041 goto done_unlocked;
Sage Weila8599bd2009-10-06 11:31:12 -07004042
4043 case CEPH_CAP_OP_IMPORT:
Yan, Zheng982d6012014-12-23 15:30:54 +08004044 realm = NULL;
4045 if (snaptrace_len) {
4046 down_write(&mdsc->snap_rwsem);
4047 ceph_update_snap_trace(mdsc, snaptrace,
4048 snaptrace + snaptrace_len,
4049 false, &realm);
4050 downgrade_write(&mdsc->snap_rwsem);
4051 } else {
4052 down_read(&mdsc->snap_rwsem);
4053 }
Yan, Zheng4ee6a912013-11-24 14:43:46 +08004054 handle_cap_import(mdsc, inode, h, peer, session,
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004055 &cap, &extra_info.issued);
4056 handle_cap_grant(inode, session, cap,
4057 h, msg->middle, &extra_info);
Yan, Zheng982d6012014-12-23 15:30:54 +08004058 if (realm)
4059 ceph_put_snap_realm(mdsc, realm);
Yan, Zheng2cd698b2014-04-18 13:20:27 +08004060 goto done_unlocked;
Sage Weila8599bd2009-10-06 11:31:12 -07004061 }
4062
4063 /* the rest require a cap */
Sage Weilbe655592011-11-30 09:47:09 -08004064 spin_lock(&ci->i_ceph_lock);
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004065 cap = __get_cap_for_mds(ceph_inode(inode), session->s_mds);
Sage Weila8599bd2009-10-06 11:31:12 -07004066 if (!cap) {
Sage Weil9dbd4122010-06-10 13:21:20 -07004067 dout(" no cap on %p ino %llx.%llx from mds%d\n",
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004068 inode, ceph_ino(inode), ceph_snap(inode),
4069 session->s_mds);
Sage Weilbe655592011-11-30 09:47:09 -08004070 spin_unlock(&ci->i_ceph_lock);
Greg Farnum21b559d2010-10-06 15:46:30 -07004071 goto flush_cap_releases;
Sage Weila8599bd2009-10-06 11:31:12 -07004072 }
4073
Sage Weilbe655592011-11-30 09:47:09 -08004074 /* note that each of these drops i_ceph_lock for us */
Sage Weila8599bd2009-10-06 11:31:12 -07004075 switch (op) {
4076 case CEPH_CAP_OP_REVOKE:
4077 case CEPH_CAP_OP_GRANT:
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004078 __ceph_caps_issued(ci, &extra_info.issued);
4079 extra_info.issued |= __ceph_caps_dirty(ci);
4080 handle_cap_grant(inode, session, cap,
4081 h, msg->middle, &extra_info);
Sage Weil15637c82010-03-16 13:42:00 -07004082 goto done_unlocked;
Sage Weila8599bd2009-10-06 11:31:12 -07004083
4084 case CEPH_CAP_OP_FLUSH_ACK:
Yan, Zhenga1c6b832018-04-27 10:29:44 +08004085 handle_cap_flush_ack(inode, le64_to_cpu(msg->hdr.tid),
4086 h, session, cap);
Sage Weila8599bd2009-10-06 11:31:12 -07004087 break;
4088
4089 case CEPH_CAP_OP_TRUNC:
Jeff Layton7391fba2020-03-18 16:43:30 -04004090 queue_trunc = handle_cap_trunc(inode, h, session);
4091 spin_unlock(&ci->i_ceph_lock);
4092 if (queue_trunc)
4093 ceph_queue_vmtruncate(inode);
Sage Weila8599bd2009-10-06 11:31:12 -07004094 break;
4095
4096 default:
Sage Weilbe655592011-11-30 09:47:09 -08004097 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07004098 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
4099 ceph_cap_op_name(op));
4100 }
4101
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08004102done:
4103 mutex_unlock(&session->s_mutex);
4104done_unlocked:
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08004105 ceph_put_string(extra_info.pool_ns);
Yan, Zheng3e1d0452019-05-18 20:39:55 +08004106 /* avoid calling iput_final() in mds dispatch threads */
4107 ceph_async_iput(inode);
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08004108 return;
Greg Farnum21b559d2010-10-06 15:46:30 -07004109
4110flush_cap_releases:
4111 /*
Yan, Zheng745a8e32015-05-14 17:22:42 +08004112 * send any cap release message to try to move things
Greg Farnum21b559d2010-10-06 15:46:30 -07004113 * along for the mds (who clearly thinks we still have this
4114 * cap).
4115 */
Yan, Zhenge3ec8d62019-01-14 17:21:19 +08004116 ceph_flush_cap_releases(mdsc, session);
4117 goto done;
Sage Weila8599bd2009-10-06 11:31:12 -07004118
4119bad:
4120 pr_err("ceph_handle_caps: corrupt message\n");
Sage Weil9ec7cab2009-12-14 15:13:47 -08004121 ceph_msg_dump(msg);
Sage Weila8599bd2009-10-06 11:31:12 -07004122 return;
4123}
4124
4125/*
4126 * Delayed work handler to process end of delayed cap release LRU list.
4127 */
Sage Weilafcdaea2009-10-14 14:27:38 -07004128void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
Sage Weila8599bd2009-10-06 11:31:12 -07004129{
Yan, Zheng4b9f2042017-06-27 17:17:24 +08004130 struct inode *inode;
Sage Weila8599bd2009-10-06 11:31:12 -07004131 struct ceph_inode_info *ci;
Sage Weila8599bd2009-10-06 11:31:12 -07004132
Sage Weila8599bd2009-10-06 11:31:12 -07004133 dout("check_delayed_caps\n");
4134 while (1) {
4135 spin_lock(&mdsc->cap_delay_lock);
4136 if (list_empty(&mdsc->cap_delay_list))
4137 break;
4138 ci = list_first_entry(&mdsc->cap_delay_list,
4139 struct ceph_inode_info,
4140 i_cap_delay_list);
4141 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
4142 time_before(jiffies, ci->i_hold_caps_max))
4143 break;
4144 list_del_init(&ci->i_cap_delay_list);
Yan, Zheng4b9f2042017-06-27 17:17:24 +08004145
4146 inode = igrab(&ci->vfs_inode);
Sage Weila8599bd2009-10-06 11:31:12 -07004147 spin_unlock(&mdsc->cap_delay_lock);
Yan, Zheng4b9f2042017-06-27 17:17:24 +08004148
4149 if (inode) {
4150 dout("check_delayed_caps on %p\n", inode);
Yan, Zhenga0d93e32020-03-05 20:21:01 +08004151 ceph_check_caps(ci, 0, NULL);
Yan, Zheng3e1d0452019-05-18 20:39:55 +08004152 /* avoid calling iput_final() in tick thread */
4153 ceph_async_iput(inode);
Yan, Zheng4b9f2042017-06-27 17:17:24 +08004154 }
Sage Weila8599bd2009-10-06 11:31:12 -07004155 }
4156 spin_unlock(&mdsc->cap_delay_lock);
4157}
4158
4159/*
Sage Weilafcdaea2009-10-14 14:27:38 -07004160 * Flush all dirty caps to the mds
4161 */
4162void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
4163{
Sage Weildb354052011-05-24 11:46:31 -07004164 struct ceph_inode_info *ci;
4165 struct inode *inode;
Sage Weilafcdaea2009-10-14 14:27:38 -07004166
4167 dout("flush_dirty_caps\n");
4168 spin_lock(&mdsc->cap_dirty_lock);
Sage Weildb354052011-05-24 11:46:31 -07004169 while (!list_empty(&mdsc->cap_dirty)) {
4170 ci = list_first_entry(&mdsc->cap_dirty, struct ceph_inode_info,
4171 i_dirty_item);
Sage Weil70b666c2011-05-27 09:24:26 -07004172 inode = &ci->vfs_inode;
4173 ihold(inode);
Sage Weildb354052011-05-24 11:46:31 -07004174 dout("flush_dirty_caps %p\n", inode);
Sage Weilafcdaea2009-10-14 14:27:38 -07004175 spin_unlock(&mdsc->cap_dirty_lock);
Yan, Zhenga0d93e32020-03-05 20:21:01 +08004176 ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL);
Sage Weil70b666c2011-05-27 09:24:26 -07004177 iput(inode);
Sage Weilafcdaea2009-10-14 14:27:38 -07004178 spin_lock(&mdsc->cap_dirty_lock);
4179 }
4180 spin_unlock(&mdsc->cap_dirty_lock);
Sage Weildb354052011-05-24 11:46:31 -07004181 dout("flush_dirty_caps done\n");
Sage Weilafcdaea2009-10-14 14:27:38 -07004182}
4183
Yan, Zheng719a2512020-03-05 20:21:00 +08004184void __ceph_touch_fmode(struct ceph_inode_info *ci,
4185 struct ceph_mds_client *mdsc, int fmode)
4186{
4187 unsigned long now = jiffies;
4188 if (fmode & CEPH_FILE_MODE_RD)
4189 ci->i_last_rd = now;
4190 if (fmode & CEPH_FILE_MODE_WR)
4191 ci->i_last_wr = now;
4192 /* queue periodic check */
4193 if (fmode &&
4194 __ceph_is_any_real_caps(ci) &&
4195 list_empty(&ci->i_cap_delay_list))
Yan, Zhenga0d93e32020-03-05 20:21:01 +08004196 __cap_delay_requeue(mdsc, ci);
Yan, Zheng719a2512020-03-05 20:21:00 +08004197}
4198
4199void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count)
4200{
4201 int i;
4202 int bits = (fmode << 1) | 1;
4203 spin_lock(&ci->i_ceph_lock);
4204 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4205 if (bits & (1 << i))
4206 ci->i_nr_by_mode[i] += count;
4207 }
4208 spin_unlock(&ci->i_ceph_lock);
4209}
4210
Sage Weilafcdaea2009-10-14 14:27:38 -07004211/*
Sage Weila8599bd2009-10-06 11:31:12 -07004212 * Drop open file reference. If we were the last open file,
4213 * we may need to release capabilities to the MDS (or schedule
4214 * their delayed release).
4215 */
Yan, Zheng719a2512020-03-05 20:21:00 +08004216void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count)
Sage Weila8599bd2009-10-06 11:31:12 -07004217{
Yan, Zheng719a2512020-03-05 20:21:00 +08004218 int i;
Yan, Zheng774a6a12016-06-06 16:01:39 +08004219 int bits = (fmode << 1) | 1;
Sage Weilbe655592011-11-30 09:47:09 -08004220 spin_lock(&ci->i_ceph_lock);
Yan, Zheng774a6a12016-06-06 16:01:39 +08004221 for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
4222 if (bits & (1 << i)) {
Yan, Zheng719a2512020-03-05 20:21:00 +08004223 BUG_ON(ci->i_nr_by_mode[i] < count);
4224 ci->i_nr_by_mode[i] -= count;
Yan, Zheng774a6a12016-06-06 16:01:39 +08004225 }
4226 }
Sage Weilbe655592011-11-30 09:47:09 -08004227 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07004228}
4229
4230/*
Jeff Laytona452bc02019-04-02 14:20:24 -04004231 * For a soon-to-be unlinked file, drop the LINK caps. If it
Zhi Zhang6ef0bc62018-01-24 21:24:33 +08004232 * looks like the link count will hit 0, drop any other caps (other
4233 * than PIN) we don't specifically want (due to the file still being
4234 * open).
4235 */
4236int ceph_drop_caps_for_unlink(struct inode *inode)
4237{
4238 struct ceph_inode_info *ci = ceph_inode(inode);
4239 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
4240
4241 spin_lock(&ci->i_ceph_lock);
4242 if (inode->i_nlink == 1) {
4243 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
4244
Zhi Zhang6ef0bc62018-01-24 21:24:33 +08004245 if (__ceph_caps_dirty(ci)) {
4246 struct ceph_mds_client *mdsc =
4247 ceph_inode_to_client(inode)->mdsc;
4248 __cap_delay_requeue_front(mdsc, ci);
4249 }
4250 }
4251 spin_unlock(&ci->i_ceph_lock);
4252 return drop;
4253}
4254
4255/*
Sage Weila8599bd2009-10-06 11:31:12 -07004256 * Helpers for embedding cap and dentry lease releases into mds
4257 * requests.
4258 *
4259 * @force is used by dentry_release (below) to force inclusion of a
4260 * record for the directory inode, even when there aren't any caps to
4261 * drop.
4262 */
4263int ceph_encode_inode_release(void **p, struct inode *inode,
4264 int mds, int drop, int unless, int force)
4265{
4266 struct ceph_inode_info *ci = ceph_inode(inode);
4267 struct ceph_cap *cap;
4268 struct ceph_mds_request_release *rel = *p;
Sage Weilec97f882010-06-24 15:12:37 -07004269 int used, dirty;
Sage Weila8599bd2009-10-06 11:31:12 -07004270 int ret = 0;
Sage Weila8599bd2009-10-06 11:31:12 -07004271
Sage Weilbe655592011-11-30 09:47:09 -08004272 spin_lock(&ci->i_ceph_lock);
Sage Weil916623d2010-03-16 15:01:07 -07004273 used = __ceph_caps_used(ci);
Sage Weilec97f882010-06-24 15:12:37 -07004274 dirty = __ceph_caps_dirty(ci);
Sage Weil916623d2010-03-16 15:01:07 -07004275
Sage Weilec97f882010-06-24 15:12:37 -07004276 dout("encode_inode_release %p mds%d used|dirty %s drop %s unless %s\n",
4277 inode, mds, ceph_cap_string(used|dirty), ceph_cap_string(drop),
Sage Weil916623d2010-03-16 15:01:07 -07004278 ceph_cap_string(unless));
4279
Sage Weilec97f882010-06-24 15:12:37 -07004280 /* only drop unused, clean caps */
4281 drop &= ~(used | dirty);
Sage Weil916623d2010-03-16 15:01:07 -07004282
Sage Weila8599bd2009-10-06 11:31:12 -07004283 cap = __get_cap_for_mds(ci, mds);
4284 if (cap && __cap_is_valid(cap)) {
Yan, Zheng222b7f92017-11-23 17:47:15 +08004285 unless &= cap->issued;
4286 if (unless) {
4287 if (unless & CEPH_CAP_AUTH_EXCL)
4288 drop &= ~CEPH_CAP_AUTH_SHARED;
4289 if (unless & CEPH_CAP_LINK_EXCL)
4290 drop &= ~CEPH_CAP_LINK_SHARED;
4291 if (unless & CEPH_CAP_XATTR_EXCL)
4292 drop &= ~CEPH_CAP_XATTR_SHARED;
4293 if (unless & CEPH_CAP_FILE_EXCL)
4294 drop &= ~CEPH_CAP_FILE_SHARED;
4295 }
4296
4297 if (force || (cap->issued & drop)) {
4298 if (cap->issued & drop) {
Yan, Zhengbb137f82013-06-03 18:22:17 +08004299 int wanted = __ceph_caps_wanted(ci);
Yan, Zhengbb137f82013-06-03 18:22:17 +08004300 dout("encode_inode_release %p cap %p "
4301 "%s -> %s, wanted %s -> %s\n", inode, cap,
Sage Weila8599bd2009-10-06 11:31:12 -07004302 ceph_cap_string(cap->issued),
Yan, Zhengbb137f82013-06-03 18:22:17 +08004303 ceph_cap_string(cap->issued & ~drop),
4304 ceph_cap_string(cap->mds_wanted),
4305 ceph_cap_string(wanted));
4306
Sage Weila8599bd2009-10-06 11:31:12 -07004307 cap->issued &= ~drop;
4308 cap->implemented &= ~drop;
Yan, Zhengbb137f82013-06-03 18:22:17 +08004309 cap->mds_wanted = wanted;
Sage Weila8599bd2009-10-06 11:31:12 -07004310 } else {
4311 dout("encode_inode_release %p cap %p %s"
4312 " (force)\n", inode, cap,
4313 ceph_cap_string(cap->issued));
4314 }
4315
4316 rel->ino = cpu_to_le64(ceph_ino(inode));
4317 rel->cap_id = cpu_to_le64(cap->cap_id);
4318 rel->seq = cpu_to_le32(cap->seq);
Himangi Saraogi08a0f242014-07-23 20:11:11 +05304319 rel->issue_seq = cpu_to_le32(cap->issue_seq);
Sage Weila8599bd2009-10-06 11:31:12 -07004320 rel->mseq = cpu_to_le32(cap->mseq);
Yan, Zhengfd7b95c2014-04-17 08:02:02 +08004321 rel->caps = cpu_to_le32(cap->implemented);
Sage Weila8599bd2009-10-06 11:31:12 -07004322 rel->wanted = cpu_to_le32(cap->mds_wanted);
4323 rel->dname_len = 0;
4324 rel->dname_seq = 0;
4325 *p += sizeof(*rel);
4326 ret = 1;
4327 } else {
Yan, Zheng222b7f92017-11-23 17:47:15 +08004328 dout("encode_inode_release %p cap %p %s (noop)\n",
Sage Weila8599bd2009-10-06 11:31:12 -07004329 inode, cap, ceph_cap_string(cap->issued));
4330 }
4331 }
Sage Weilbe655592011-11-30 09:47:09 -08004332 spin_unlock(&ci->i_ceph_lock);
Sage Weila8599bd2009-10-06 11:31:12 -07004333 return ret;
4334}
4335
4336int ceph_encode_dentry_release(void **p, struct dentry *dentry,
Jeff Laytonca6c8ae2016-12-15 08:37:59 -05004337 struct inode *dir,
Sage Weila8599bd2009-10-06 11:31:12 -07004338 int mds, int drop, int unless)
4339{
Jeff Laytonca6c8ae2016-12-15 08:37:59 -05004340 struct dentry *parent = NULL;
Sage Weila8599bd2009-10-06 11:31:12 -07004341 struct ceph_mds_request_release *rel = *p;
4342 struct ceph_dentry_info *di = ceph_dentry(dentry);
4343 int force = 0;
4344 int ret;
4345
4346 /*
4347 * force an record for the directory caps if we have a dentry lease.
Sage Weilbe655592011-11-30 09:47:09 -08004348 * this is racy (can't take i_ceph_lock and d_lock together), but it
Sage Weila8599bd2009-10-06 11:31:12 -07004349 * doesn't have to be perfect; the mds will revoke anything we don't
4350 * release.
4351 */
4352 spin_lock(&dentry->d_lock);
4353 if (di->lease_session && di->lease_session->s_mds == mds)
4354 force = 1;
Jeff Laytonca6c8ae2016-12-15 08:37:59 -05004355 if (!dir) {
4356 parent = dget(dentry->d_parent);
4357 dir = d_inode(parent);
4358 }
Sage Weila8599bd2009-10-06 11:31:12 -07004359 spin_unlock(&dentry->d_lock);
4360
Jeff Laytonca6c8ae2016-12-15 08:37:59 -05004361 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
Jeff Laytonadf0d682016-12-15 08:37:58 -05004362 dput(parent);
Sage Weila8599bd2009-10-06 11:31:12 -07004363
4364 spin_lock(&dentry->d_lock);
4365 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
4366 dout("encode_dentry_release %p mds%d seq %d\n",
4367 dentry, mds, (int)di->lease_seq);
4368 rel->dname_len = cpu_to_le32(dentry->d_name.len);
4369 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
4370 *p += dentry->d_name.len;
4371 rel->dname_seq = cpu_to_le32(di->lease_seq);
Sage Weil1dadcce2010-07-23 13:54:21 -07004372 __ceph_mdsc_drop_dentry_lease(dentry);
Sage Weila8599bd2009-10-06 11:31:12 -07004373 }
4374 spin_unlock(&dentry->d_lock);
4375 return ret;
4376}