// SPDX-License-Identifier: GPL-2.0
#include <linux/ceph/ceph_debug.h>

#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/writeback.h>	/* generic_writepages */
#include <linux/slab.h>
#include <linux/pagevec.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/signal.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
#include <linux/netfs.h>

#include "super.h"
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>

/*
 * Ceph address space ops.
 *
 * There are a few funny things going on here.
 *
 * The page->private field is used to reference a struct
 * ceph_snap_context for _every_ dirty page. This indicates which
 * snapshot the page was logically dirtied in, and thus which snap
 * context needs to be associated with the osd write during writeback.
 *
 * Similarly, struct ceph_inode_info maintains a set of counters to
 * count dirty pages on the inode. In the absence of snapshots,
 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
 *
 * When a snapshot is taken (that is, when the client receives
 * notification that a snapshot was taken), each inode with caps and
 * with dirty pages (dirty pages implies there is a cap) gets a new
 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
 * moved to capsnap->dirty. (Unless a sync write is currently in
 * progress. In that case, the capsnap is said to be "pending", new
 * writes cannot start, and the capsnap isn't "finalized" until the
 * write completes (or fails) and a final size/mtime for the inode for
 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
 *
 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
 * we look for the first capsnap in i_cap_snaps and write out pages in
 * that snap context _only_. Then we move on to the next capsnap,
 * eventually reaching the "live" or "head" context (i.e., pages that
 * are not yet snapped) and are writing the most recently dirtied
 * pages.
 *
 * Invalidate and so forth must take care to ensure the dirty page
 * accounting is preserved.
 */

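/*
 * Writeback congestion thresholds: congestion_kb is converted to a page
 * count for the "on" threshold, and the "off" threshold is set 25% lower
 * so congestion is not toggled on and off for every page completion.
 */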
#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
#define CONGESTION_OFF_THRESH(congestion_kb)				\
	(CONGESTION_ON_THRESH(congestion_kb) -				\
	 (CONGESTION_ON_THRESH(congestion_kb) >> 2))

static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio *folio, void **_fsdata);

static inline struct ceph_snap_context *page_snap_context(struct page *page)
{
	if (PagePrivate(page))
		return (void *)page->private;
	return NULL;
}

/*
 * Dirty a page. Optimistically adjust accounting, on the assumption
 * that we won't race with invalidate. If we do, readjust.
 */
static int ceph_set_page_dirty(struct page *page)
{
	struct address_space *mapping = page->mapping;
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;

	if (PageDirty(page)) {
		dout("%p set_page_dirty %p idx %lu -- already dirty\n",
		     mapping->host, page, page->index);
		BUG_ON(!PagePrivate(page));
		return 0;
	}

	inode = mapping->host;
	ci = ceph_inode(inode);

	/* dirty the head */
	spin_lock(&ci->i_ceph_lock);
	BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference
	if (__ceph_have_pending_cap_snap(ci)) {
		struct ceph_cap_snap *capsnap =
				list_last_entry(&ci->i_cap_snaps,
						struct ceph_cap_snap,
						ci_item);
		snapc = ceph_get_snap_context(capsnap->context);
		capsnap->dirty_pages++;
	} else {
		BUG_ON(!ci->i_head_snapc);
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		++ci->i_wrbuffer_ref_head;
	}
	if (ci->i_wrbuffer_ref == 0)
		ihold(inode);
	++ci->i_wrbuffer_ref;
	dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
	     "snapc %p seq %lld (%d snaps)\n",
	     mapping->host, page, page->index,
	     ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
	     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
	     snapc, snapc->seq, snapc->num_snaps);
	spin_unlock(&ci->i_ceph_lock);

	/*
	 * Reference snap context in page->private. Also set
	 * PagePrivate so that we get invalidatepage callback.
	 */
	BUG_ON(PagePrivate(page));
	attach_page_private(page, snapc);

	return __set_page_dirty_nobuffers(page);
}

/*
 * If we are truncating the full page (i.e. offset == 0), adjust the
 * dirty page counters appropriately. Only called if there is private
 * data on the page.
 */
static void ceph_invalidatepage(struct page *page, unsigned int offset,
				unsigned int length)
{
	struct inode *inode;
	struct ceph_inode_info *ci;
	struct ceph_snap_context *snapc;

	wait_on_page_fscache(page);

	inode = page->mapping->host;
	ci = ceph_inode(inode);

	if (offset != 0 || length != thp_size(page)) {
		dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n",
		     inode, page, page->index, offset, length);
		return;
	}

	WARN_ON(!PageLocked(page));
	if (!PagePrivate(page))
		return;

	dout("%p invalidatepage %p idx %lu full dirty page\n",
	     inode, page, page->index);

	snapc = detach_page_private(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);
}

static int ceph_releasepage(struct page *page, gfp_t gfp)
{
	dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host,
	     page, page->index, PageDirty(page) ? "" : "not ");

	if (PageFsCache(page)) {
		if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS))
			return 0;
		wait_on_page_fscache(page);
	}
	return !PagePrivate(page);
}

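/*
 * Expand a netfs read request to whole stripe units: round the start down
 * and the length up to the file layout's stripe_unit boundaries.
 */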
static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
	struct inode *inode = rreq->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_layout *lo = &ci->i_layout;
	u32 blockoff;
	u64 blockno;

	/* Expand the start downward */
	blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff);
	rreq->start = blockno * lo->stripe_unit;
	rreq->len += blockoff;

	/* Now, round up the length to the next block */
	rreq->len = roundup(rreq->len, lo->stripe_unit);
}

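/*
 * Clamp a netfs read subrequest so it does not cross the object/stripe-unit
 * boundary computed by ceph_calc_file_object_mapping(), and never exceeds
 * the rsize mount option.
 */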
static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->mapping->host;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 objno, objoff;
	u32 xlen;

	/* Truncate the extent at the end of the current block */
	ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len,
				      &objno, &objoff, &xlen);
	subreq->len = min(xlen, fsc->mount_options->rsize);
	return true;
}

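/*
 * Completion callback for netfs read OSD requests: record read metrics,
 * treat a missing object (-ENOENT) as success with no data, flag the client
 * on -EBLOCKLISTED, terminate the netfs subrequest, and drop the page
 * vector and inode reference taken when the request was issued.
 */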
static void finish_netfs_read(struct ceph_osd_request *req)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct netfs_read_subrequest *subreq = req->r_priv;
	int num_pages;
	int err = req->r_result;

	ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
				 req->r_end_latency, osd_data->length, err);

	dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result,
	     subreq->len, i_size_read(req->r_inode));

	/* no object means success but no data */
	if (err == -ENOENT)
		err = 0;
	else if (err == -EBLOCKLISTED)
		fsc->blocklisted = true;

	if (err >= 0 && err < subreq->len)
		__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);

	netfs_subreq_terminated(subreq, err, true);

	num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
	ceph_put_page_vector(osd_data->pages, num_pages, false);
	iput(req->r_inode);
}

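/*
 * Issue the read for a netfs subrequest: build an OSD read for the
 * subrequest range, point it at the pages already present in the pagecache
 * xarray, and submit it asynchronously with finish_netfs_read() as the
 * completion callback.
 */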
static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
	struct netfs_read_request *rreq = subreq->rreq;
	struct inode *inode = rreq->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
	struct ceph_vino vino = ceph_vino(inode);
	struct iov_iter iter;
	struct page **pages;
	size_t page_off;
	int err = 0;
	u64 len = subreq->len;

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
			0, 1, CEPH_OSD_OP_READ,
			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		req = NULL;
		goto out;
	}

	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
	err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off);
	if (err < 0) {
		dout("%s: iov_iter_get_pages_alloc returned %d\n", __func__, err);
		goto out;
	}

	/* should always give us a page-aligned read */
	WARN_ON_ONCE(page_off);
	len = err;

	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
	req->r_callback = finish_netfs_read;
	req->r_priv = subreq;
	req->r_inode = inode;
	ihold(inode);

	err = ceph_osdc_start_request(req->r_osdc, req, false);
	if (err)
		iput(inode);
out:
	ceph_osdc_put_request(req);
	if (err)
		netfs_subreq_terminated(subreq, err, false);
	dout("%s: result %d\n", __func__, err);
}

static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file)
{
}

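/* Drop the cap references that ceph_readahead() took for this request. */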
static void ceph_readahead_cleanup(struct address_space *mapping, void *priv)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	int got = (uintptr_t)priv;

	if (got)
		ceph_put_cap_refs(ci, got);
}

static const struct netfs_read_request_ops ceph_netfs_read_ops = {
	.init_rreq		= ceph_init_rreq,
	.is_cache_enabled	= ceph_is_cache_enabled,
	.begin_cache_operation	= ceph_begin_cache_operation,
	.issue_op		= ceph_netfs_issue_op,
	.expand_readahead	= ceph_netfs_expand_readahead,
	.clamp_length		= ceph_netfs_clamp_length,
	.check_write_begin	= ceph_netfs_check_write_begin,
	.cleanup		= ceph_readahead_cleanup,
};

/* read a single page, without unlocking it. */
static int ceph_readpage(struct file *file, struct page *subpage)
{
	struct folio *folio = page_folio(subpage);
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_vino vino = ceph_vino(inode);
	size_t len = folio_size(folio);
	u64 off = folio_file_pos(folio);

	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		/*
		 * Uptodate inline data should have been added
		 * into page cache while getting Fcr caps.
		 */
		if (off == 0) {
			folio_unlock(folio);
			return -EINVAL;
		}
		zero_user_segment(&folio->page, 0, folio_size(folio));
		folio_mark_uptodate(folio);
		folio_unlock(folio);
		return 0;
	}

	dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
	     vino.ino, vino.snap, file, off, len, folio, folio_index(folio));

	return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}

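/*
 * Start readahead through netfs. Callers that don't already hold caps via
 * an rw context (e.g. fadvise, madvise) take a Fc cap reference first; the
 * reference is handed to netfs as private data and dropped in
 * ceph_readahead_cleanup().
 */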
static void ceph_readahead(struct readahead_control *ractl)
{
	struct inode *inode = file_inode(ractl->file);
	struct ceph_file_info *fi = ractl->file->private_data;
	struct ceph_rw_context *rw_ctx;
	int got = 0;
	int ret = 0;

	if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE)
		return;

	rw_ctx = ceph_find_rw_context(fi);
	if (!rw_ctx) {
		/*
		 * readahead callers do not necessarily hold Fcb caps
		 * (e.g. fadvise, madvise).
		 */
		int want = CEPH_CAP_FILE_CACHE;

		ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got);
		if (ret < 0)
			dout("start_read %p, error getting cap\n", inode);
		else if (!(got & want))
			dout("start_read %p, no cache cap\n", inode);

		if (ret <= 0)
			return;
	}
	netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got);
}

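/*
 * Snapshot of the inode size and truncate state used for one writeback
 * pass, filled in by get_oldest_context().
 */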
struct ceph_writeback_ctl
{
	loff_t i_size;
	u64 truncate_size;
	u32 truncate_seq;
	bool size_stable;
	bool head_snapc;
};

/*
 * Get ref for the oldest snapc for an inode with dirty data... that is, the
 * only snap context we are allowed to write back.
 */
static struct ceph_snap_context *
get_oldest_context(struct inode *inode, struct ceph_writeback_ctl *ctl,
		   struct ceph_snap_context *page_snapc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = NULL;
	struct ceph_cap_snap *capsnap = NULL;

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
		     capsnap->context, capsnap->dirty_pages);
		if (!capsnap->dirty_pages)
			continue;

		/* get i_size, truncate_{seq,size} for page_snapc? */
		if (snapc && capsnap->context != page_snapc)
			continue;

		if (ctl) {
			if (capsnap->writing) {
				ctl->i_size = i_size_read(inode);
				ctl->size_stable = false;
			} else {
				ctl->i_size = capsnap->size;
				ctl->size_stable = true;
			}
			ctl->truncate_size = capsnap->truncate_size;
			ctl->truncate_seq = capsnap->truncate_seq;
			ctl->head_snapc = false;
		}

		if (snapc)
			break;

		snapc = ceph_get_snap_context(capsnap->context);
		if (!page_snapc ||
		    page_snapc == snapc ||
		    page_snapc->seq > snapc->seq)
			break;
	}
	if (!snapc && ci->i_wrbuffer_ref_head) {
		snapc = ceph_get_snap_context(ci->i_head_snapc);
		dout(" head snapc %p has %d dirty pages\n",
		     snapc, ci->i_wrbuffer_ref_head);
		if (ctl) {
			ctl->i_size = i_size_read(inode);
			ctl->truncate_size = ci->i_truncate_size;
			ctl->truncate_seq = ci->i_truncate_seq;
			ctl->size_stable = false;
			ctl->head_snapc = true;
		}
	}
	spin_unlock(&ci->i_ceph_lock);
	return snapc;
}

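/*
 * Work out how many bytes from 'start' on this page should be written for
 * the page's snap context: capped by the capsnap size (or i_size for the
 * head context) and by the end of the page itself.
 */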
static u64 get_writepages_data_length(struct inode *inode,
				      struct page *page, u64 start)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = page_snap_context(page);
	struct ceph_cap_snap *capsnap = NULL;
	u64 end = i_size_read(inode);

	if (snapc != ci->i_head_snapc) {
		bool found = false;
		spin_lock(&ci->i_ceph_lock);
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				if (!capsnap->writing)
					end = capsnap->size;
				found = true;
				break;
			}
		}
		spin_unlock(&ci->i_ceph_lock);
		WARN_ON(!found);
	}
	if (end > page_offset(page) + thp_size(page))
		end = page_offset(page) + thp_size(page);
	return end > start ? end - start : 0;
}

/*
 * Write a single page, but leave the page locked.
 *
 * If we get a write error, mark the mapping for error, but still adjust the
 * dirty page accounting (i.e., page is no longer dirty).
 */
static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_snap_context *snapc, *oldest;
	loff_t page_off = page_offset(page);
	int err;
	loff_t len = thp_size(page);
	struct ceph_writeback_ctl ceph_wbc;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;

	dout("writepage %p idx %lu\n", page, page->index);

	/* verify this is a writeable snap context */
	snapc = page_snap_context(page);
	if (!snapc) {
		dout("writepage %p page %p not dirty?\n", inode, page);
		return 0;
	}
	oldest = get_oldest_context(inode, &ceph_wbc, snapc);
	if (snapc->seq > oldest->seq) {
		dout("writepage %p page %p snapc %p not writeable - noop\n",
		     inode, page, snapc);
		/* we should only noop if called by kswapd */
		WARN_ON(!(current->flags & PF_MEMALLOC));
		ceph_put_snap_context(oldest);
		redirty_page_for_writepage(wbc, page);
		return 0;
	}
	ceph_put_snap_context(oldest);

	/* is this a partial page at end of file? */
	if (page_off >= ceph_wbc.i_size) {
		dout("%p page eof %llu\n", page, ceph_wbc.i_size);
		page->mapping->a_ops->invalidatepage(page, 0, thp_size(page));
		return 0;
	}

	if (ceph_wbc.i_size < page_off + len)
		len = ceph_wbc.i_size - page_off;

	dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
	     inode, page, page->index, page_off, len, snapc, snapc->seq);

	if (atomic_long_inc_return(&fsc->writeback_count) >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);

	set_page_writeback(page);
	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
				    true);
	if (IS_ERR(req)) {
		redirty_page_for_writepage(wbc, page);
		end_page_writeback(page);
		return PTR_ERR(req);
	}

	/* it may be a short write due to an object boundary */
	WARN_ON_ONCE(len > thp_size(page));
	osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);

	req->r_mtime = inode->i_mtime;
	err = ceph_osdc_start_request(osdc, req, true);
	if (!err)
		err = ceph_osdc_wait_request(osdc, req);

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

	ceph_osdc_put_request(req);
	if (err == 0)
		err = len;

	if (err < 0) {
		struct writeback_control tmp_wbc;
		if (!wbc)
			wbc = &tmp_wbc;
		if (err == -ERESTARTSYS) {
			/* killed by SIGKILL */
			dout("writepage interrupted page %p\n", page);
			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			return err;
		}
		if (err == -EBLOCKLISTED)
			fsc->blocklisted = true;
		dout("writepage setting page/mapping error %d %p\n",
		     err, page);
		mapping_set_error(&inode->i_data, err);
		wbc->pages_skipped++;
	} else {
		dout("writepage cleaned page %p\n", page);
		err = 0;  /* vfs expects us to return 0 */
	}
	oldest = detach_page_private(page);
	WARN_ON_ONCE(oldest != snapc);
	end_page_writeback(page);
	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
	ceph_put_snap_context(snapc);	/* page's reference */

	if (atomic_long_dec_return(&fsc->writeback_count) <
	    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);

	return err;
}

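/*
 * ->writepage entry point: pin the inode across the write, swallow
 * -ERESTARTSYS so the caller doesn't record a mapping error, and unlock
 * the page in all cases.
 */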
static int ceph_writepage(struct page *page, struct writeback_control *wbc)
{
	int err;
	struct inode *inode = page->mapping->host;
	BUG_ON(!inode);
	ihold(inode);
	err = writepage_nounlock(page, wbc);
	if (err == -ERESTARTSYS) {
		/* direct memory reclaimer was killed by SIGKILL. return 0
		 * to prevent caller from setting mapping/page error */
		err = 0;
	}
	unlock_page(page);
	iput(inode);
	return err;
}

/*
 * async writeback completion handler.
 *
 * If we get an error, set the mapping error bit, but not the individual
 * page error bits.
 */
static void writepages_finish(struct ceph_osd_request *req)
{
	struct inode *inode = req->r_inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_osd_data *osd_data;
	struct page *page;
	int num_pages, total_pages = 0;
	int i, j;
	int rc = req->r_result;
	struct ceph_snap_context *snapc = req->r_snapc;
	struct address_space *mapping = inode->i_mapping;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	unsigned int len = 0;
	bool remove_page;

	dout("writepages_finish %p rc %d\n", inode, rc);
	if (rc < 0) {
		mapping_set_error(mapping, rc);
		ceph_set_error_write(ci);
		if (rc == -EBLOCKLISTED)
			fsc->blocklisted = true;
	} else {
		ceph_clear_error_write(ci);
	}

	/*
	 * We lost the cache cap, need to truncate the page before
	 * it is unlocked, otherwise we'd truncate it later in the
	 * page truncation thread, possibly losing some data that
	 * raced its way in
	 */
	remove_page = !(ceph_caps_issued(ci) &
			(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));

	/* clean all pages */
	for (i = 0; i < req->r_num_ops; i++) {
		if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
			break;

		osd_data = osd_req_op_extent_osd_data(req, i);
		BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
		len += osd_data->length;
		num_pages = calc_pages_for((u64)osd_data->alignment,
					   (u64)osd_data->length);
		total_pages += num_pages;
		for (j = 0; j < num_pages; j++) {
			page = osd_data->pages[j];
			BUG_ON(!page);
			WARN_ON(!PageUptodate(page));

			if (atomic_long_dec_return(&fsc->writeback_count) <
			     CONGESTION_OFF_THRESH(
					fsc->mount_options->congestion_kb))
				clear_bdi_congested(inode_to_bdi(inode),
						    BLK_RW_ASYNC);

			ceph_put_snap_context(detach_page_private(page));
			end_page_writeback(page);
			dout("unlocking %p\n", page);

			if (remove_page)
				generic_error_remove_page(inode->i_mapping,
							  page);

			unlock_page(page);
		}
		dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
		     inode, osd_data->length, rc >= 0 ? num_pages : 0);

		release_pages(osd_data->pages, num_pages);
	}

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, rc);

	ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);

	osd_data = osd_req_op_extent_osd_data(req, 0);
	if (osd_data->pages_from_pool)
		mempool_free(osd_data->pages, ceph_wb_pagevec_pool);
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
}

/*
 * initiate async writeback
 */
static int ceph_writepages_start(struct address_space *mapping,
				 struct writeback_control *wbc)
{
	struct inode *inode = mapping->host;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_vino vino = ceph_vino(inode);
	pgoff_t index, start_index, end = -1;
	struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
	struct pagevec pvec;
	int rc = 0;
	unsigned int wsize = i_blocksize(inode);
	struct ceph_osd_request *req = NULL;
	struct ceph_writeback_ctl ceph_wbc;
	bool should_loop, range_whole = false;
	bool done = false;

	dout("writepages_start %p (mode=%s)\n", inode,
	     wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
	     (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));

	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
		if (ci->i_wrbuffer_ref > 0) {
			pr_warn_ratelimited(
				"writepage_start %p %lld forced umount\n",
				inode, ceph_ino(inode));
		}
		mapping_set_error(mapping, -EIO);
		return -EIO; /* we're in a forced umount, don't write! */
	}
	if (fsc->mount_options->wsize < wsize)
		wsize = fsc->mount_options->wsize;

	pagevec_init(&pvec);

	start_index = wbc->range_cyclic ? mapping->writeback_index : 0;
	index = start_index;

retry:
	/* find oldest snap context with dirty data */
	snapc = get_oldest_context(inode, &ceph_wbc, NULL);
	if (!snapc) {
		/* hmm, why does writepages get called when there
		   is no dirty data? */
		dout(" no snap context with dirty data?\n");
		goto out;
	}
	dout(" oldest snapc is %p seq %lld (%d snaps)\n",
	     snapc, snapc->seq, snapc->num_snaps);

	should_loop = false;
	if (ceph_wbc.head_snapc && snapc != last_snapc) {
		/* where to start/end? */
		if (wbc->range_cyclic) {
			index = start_index;
			end = -1;
			if (index > 0)
				should_loop = true;
			dout(" cyclic, start at %lu\n", index);
		} else {
			index = wbc->range_start >> PAGE_SHIFT;
			end = wbc->range_end >> PAGE_SHIFT;
			if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
				range_whole = true;
			dout(" not cyclic, %lu to %lu\n", index, end);
		}
	} else if (!ceph_wbc.head_snapc) {
		/* Do not respect wbc->range_{start,end}. Dirty pages
		 * in that range can be associated with newer snapc.
		 * They are not writeable until we write all dirty pages
		 * associated with 'snapc' get written */
		if (index > 0)
			should_loop = true;
		dout(" non-head snapc, range whole\n");
	}

	ceph_put_snap_context(last_snapc);
	last_snapc = snapc;

	while (!done && index <= end) {
		int num_ops = 0, op_idx;
		unsigned i, pvec_pages, max_pages, locked_pages = 0;
		struct page **pages = NULL, **data_pages;
		struct page *page;
		pgoff_t strip_unit_end = 0;
		u64 offset = 0, len = 0;
		bool from_pool = false;

		max_pages = wsize >> PAGE_SHIFT;

get_more_pages:
		pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index,
						      end, PAGECACHE_TAG_DIRTY);
		dout("pagevec_lookup_range_tag got %d\n", pvec_pages);
		if (!pvec_pages && !locked_pages)
			break;
		for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
			page = pvec.pages[i];
			dout("? %p idx %lu\n", page, page->index);
			if (locked_pages == 0)
				lock_page(page);  /* first page */
			else if (!trylock_page(page))
				break;

			/* only dirty pages, or our accounting breaks */
			if (unlikely(!PageDirty(page)) ||
			    unlikely(page->mapping != mapping)) {
				dout("!dirty or !mapping %p\n", page);
				unlock_page(page);
				continue;
			}
			/* only if matching snap context */
			pgsnapc = page_snap_context(page);
			if (pgsnapc != snapc) {
				dout("page snapc %p %lld != oldest %p %lld\n",
				     pgsnapc, pgsnapc->seq, snapc, snapc->seq);
				if (!should_loop &&
				    !ceph_wbc.head_snapc &&
				    wbc->sync_mode != WB_SYNC_NONE)
					should_loop = true;
				unlock_page(page);
				continue;
			}
			if (page_offset(page) >= ceph_wbc.i_size) {
				dout("%p page eof %llu\n",
				     page, ceph_wbc.i_size);
				if ((ceph_wbc.size_stable ||
				    page_offset(page) >= i_size_read(inode)) &&
				    clear_page_dirty_for_io(page))
					mapping->a_ops->invalidatepage(page,
								0, thp_size(page));
				unlock_page(page);
				continue;
			}
			if (strip_unit_end && (page->index > strip_unit_end)) {
				dout("end of strip unit %p\n", page);
				unlock_page(page);
				break;
			}
			if (PageWriteback(page)) {
				if (wbc->sync_mode == WB_SYNC_NONE) {
					dout("%p under writeback\n", page);
					unlock_page(page);
					continue;
				}
				dout("waiting on writeback %p\n", page);
				wait_on_page_writeback(page);
			}

			if (!clear_page_dirty_for_io(page)) {
				dout("%p !clear_page_dirty_for_io\n", page);
				unlock_page(page);
				continue;
			}

			/*
			 * We have something to write. If this is
			 * the first locked page this time through,
			 * calculate max possible write size and
			 * allocate a page array
			 */
			if (locked_pages == 0) {
				u64 objnum;
				u64 objoff;
				u32 xlen;

				/* prepare async write request */
				offset = (u64)page_offset(page);
				ceph_calc_file_object_mapping(&ci->i_layout,
							      offset, wsize,
							      &objnum, &objoff,
							      &xlen);
				len = xlen;

				num_ops = 1;
				strip_unit_end = page->index +
					((len - 1) >> PAGE_SHIFT);

				BUG_ON(pages);
				max_pages = calc_pages_for(0, (u64)len);
				pages = kmalloc_array(max_pages,
						      sizeof(*pages),
						      GFP_NOFS);
				if (!pages) {
					from_pool = true;
					pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
					BUG_ON(!pages);
				}

				len = 0;
			} else if (page->index !=
				   (offset + len) >> PAGE_SHIFT) {
				if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS :
						            CEPH_OSD_MAX_OPS)) {
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}

				num_ops++;
				offset = (u64)page_offset(page);
				len = 0;
			}

			/* note position of first page in pvec */
			dout("%p will write page %p idx %lu\n",
			     inode, page, page->index);

			if (atomic_long_inc_return(&fsc->writeback_count) >
			    CONGESTION_ON_THRESH(
				    fsc->mount_options->congestion_kb)) {
				set_bdi_congested(inode_to_bdi(inode),
						  BLK_RW_ASYNC);
			}


			pages[locked_pages++] = page;
			pvec.pages[i] = NULL;

			len += thp_size(page);
		}

		/* did we get anything? */
		if (!locked_pages)
			goto release_pvec_pages;
		if (i) {
			unsigned j, n = 0;
			/* shift unused page to beginning of pvec */
			for (j = 0; j < pvec_pages; j++) {
				if (!pvec.pages[j])
					continue;
				if (n < j)
					pvec.pages[n] = pvec.pages[j];
				n++;
			}
			pvec.nr = n;

			if (pvec_pages && i == pvec_pages &&
			    locked_pages < max_pages) {
				dout("reached end pvec, trying for more\n");
				pagevec_release(&pvec);
				goto get_more_pages;
			}
		}

new_request:
		offset = page_offset(pages[0]);
		len = wsize;

		req = ceph_osdc_new_request(&fsc->client->osdc,
					&ci->i_layout, vino,
					offset, &len, 0, num_ops,
					CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
					snapc, ceph_wbc.truncate_seq,
					ceph_wbc.truncate_size, false);
		if (IS_ERR(req)) {
			req = ceph_osdc_new_request(&fsc->client->osdc,
						&ci->i_layout, vino,
						offset, &len, 0,
						min(num_ops,
						    CEPH_OSD_SLAB_OPS),
						CEPH_OSD_OP_WRITE,
						CEPH_OSD_FLAG_WRITE,
						snapc, ceph_wbc.truncate_seq,
						ceph_wbc.truncate_size, true);
			BUG_ON(IS_ERR(req));
		}
		BUG_ON(len < page_offset(pages[locked_pages - 1]) +
			     thp_size(page) - offset);

		req->r_callback = writepages_finish;
		req->r_inode = inode;

		/* Format the osd request message and submit the write */
		len = 0;
		data_pages = pages;
		op_idx = 0;
		for (i = 0; i < locked_pages; i++) {
			u64 cur_offset = page_offset(pages[i]);
			if (offset + len != cur_offset) {
				if (op_idx + 1 == req->r_num_ops)
					break;
				osd_req_op_extent_dup_last(req, op_idx,
							   cur_offset - offset);
				dout("writepages got pages at %llu~%llu\n",
				     offset, len);
				osd_req_op_extent_osd_data_pages(req, op_idx,
							data_pages, len, 0,
							from_pool, false);
				osd_req_op_extent_update(req, op_idx, len);

				len = 0;
				offset = cur_offset;
				data_pages = pages + i;
				op_idx++;
			}

			set_page_writeback(pages[i]);
			len += thp_size(page);
		}

		if (ceph_wbc.size_stable) {
			len = min(len, ceph_wbc.i_size - offset);
		} else if (i == locked_pages) {
			/* writepages_finish() clears writeback pages
			 * according to the data length, so make sure
			 * data length covers all locked pages */
			u64 min_len = len + 1 - thp_size(page);
			len = get_writepages_data_length(inode, pages[i - 1],
							 offset);
			len = max(len, min_len);
		}
		dout("writepages got pages at %llu~%llu\n", offset, len);

		osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
						 0, from_pool, false);
		osd_req_op_extent_update(req, op_idx, len);

		BUG_ON(op_idx + 1 != req->r_num_ops);

		from_pool = false;
		if (i < locked_pages) {
			BUG_ON(num_ops <= req->r_num_ops);
			num_ops -= req->r_num_ops;
			locked_pages -= i;

			/* allocate new pages array for next request */
			data_pages = pages;
			pages = kmalloc_array(locked_pages, sizeof(*pages),
					      GFP_NOFS);
			if (!pages) {
				from_pool = true;
				pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS);
				BUG_ON(!pages);
			}
			memcpy(pages, data_pages + i,
			       locked_pages * sizeof(*pages));
			memset(data_pages + i, 0,
			       locked_pages * sizeof(*pages));
		} else {
			BUG_ON(num_ops != req->r_num_ops);
			index = pages[i - 1]->index + 1;
			/* request message now owns the pages array */
			pages = NULL;
		}

		req->r_mtime = inode->i_mtime;
		rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
		BUG_ON(rc);
		req = NULL;

		wbc->nr_to_write -= i;
		if (pages)
			goto new_request;

		/*
		 * We stop writing back only if we are not doing
		 * integrity sync. In case of integrity sync we have to
		 * keep going until we have written all the pages
		 * we tagged for writeback prior to entering this loop.
		 */
		if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE)
			done = true;

release_pvec_pages:
		dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
		     pvec.nr ? pvec.pages[0] : NULL);
		pagevec_release(&pvec);
	}

	if (should_loop && !done) {
		/* more to do; loop back to beginning of file */
		dout("writepages looping back to beginning of file\n");
		end = start_index - 1; /* OK even when start_index == 0 */

		/* to write dirty pages associated with next snapc,
		 * we need to wait until current writes complete */
		if (wbc->sync_mode != WB_SYNC_NONE &&
		    start_index == 0 && /* all dirty pages were checked */
		    !ceph_wbc.head_snapc) {
			struct page *page;
			unsigned i, nr;
			index = 0;
			while ((index <= end) &&
			       (nr = pagevec_lookup_tag(&pvec, mapping, &index,
						PAGECACHE_TAG_WRITEBACK))) {
				for (i = 0; i < nr; i++) {
					page = pvec.pages[i];
					if (page_snap_context(page) != snapc)
						continue;
					wait_on_page_writeback(page);
				}
				pagevec_release(&pvec);
				cond_resched();
			}
		}

		start_index = 0;
		index = 0;
		goto retry;
	}

	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = index;

out:
	ceph_osdc_put_request(req);
	ceph_put_snap_context(last_snapc);
	dout("writepages dend - startone, rc = %d\n", rc);
	return rc;
}



/*
 * See if a given @snapc is either writeable, or already written.
 */
static int context_is_writeable_or_written(struct inode *inode,
					   struct ceph_snap_context *snapc)
{
	struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, NULL);
	int ret = !oldest || snapc->seq <= oldest->seq;

	ceph_put_snap_context(oldest);
	return ret;
}

/**
 * ceph_find_incompatible - find an incompatible context and return it
 * @page: page being dirtied
 *
 * We are only allowed to write into/dirty a page if the page is
 * clean, or already dirty within the same snap context. Returns a
 * conflicting context if there is one, NULL if there isn't, or a
 * negative error code on other errors.
 *
 * Must be called with page lock held.
 */
static struct ceph_snap_context *
ceph_find_incompatible(struct page *page)
{
	struct inode *inode = page->mapping->host;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);

	if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) {
		dout(" page %p forced umount\n", page);
		return ERR_PTR(-EIO);
	}

	for (;;) {
		struct ceph_snap_context *snapc, *oldest;

		wait_on_page_writeback(page);

		snapc = page_snap_context(page);
		if (!snapc || snapc == ci->i_head_snapc)
			break;

		/*
		 * this page is already dirty in another (older) snap
		 * context!  is it writeable now?
		 */
		oldest = get_oldest_context(inode, NULL, NULL);
		if (snapc->seq > oldest->seq) {
			/* not writeable -- return it for the caller to deal with */
			ceph_put_snap_context(oldest);
			dout(" page %p snapc %p not current or oldest\n", page, snapc);
			return ceph_get_snap_context(snapc);
		}
		ceph_put_snap_context(oldest);

		/* yay, writeable, do it now (without dropping page lock) */
		dout(" page %p snapc %p not current, but oldest\n", page, snapc);
		if (clear_page_dirty_for_io(page)) {
			int r = writepage_nounlock(page, NULL);
			if (r < 0)
				return ERR_PTR(r);
		}
	}
	return NULL;
}

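/*
 * netfs check_write_begin hook: if the folio is dirty in an older,
 * not-yet-writeable snap context, drop the folio, kick off writeback and
 * wait for that context to become writeable, then return -EAGAIN so the
 * write_begin attempt is retried.
 */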
static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len,
					struct folio *folio, void **_fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc;

	snapc = ceph_find_incompatible(folio_page(folio, 0));
	if (snapc) {
		int r;

		folio_unlock(folio);
		folio_put(folio);
		if (IS_ERR(snapc))
			return PTR_ERR(snapc);

		ceph_queue_writeback(inode);
		r = wait_event_killable(ci->i_cap_wq,
					context_is_writeable_or_written(inode, snapc));
		ceph_put_snap_context(snapc);
		return r == 0 ? -EAGAIN : r;
	}
	return 0;
}

/*
 * We are only allowed to write into/dirty the page if the page is
 * clean, or already dirty within the same snap context.
 */
static int ceph_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned aop_flags,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct folio *folio = NULL;
	pgoff_t index = pos >> PAGE_SHIFT;
	int r;

	/*
	 * Uninlining should have already been done and everything updated, EXCEPT
	 * for inline_version sent to the MDS.
	 */
	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
		if (aop_flags & AOP_FLAG_NOFS)
			fgp_flags |= FGP_NOFS;
		folio = __filemap_get_folio(mapping, index, fgp_flags,
					    mapping_gfp_mask(mapping));
		if (!folio)
			return -ENOMEM;

		/*
		 * The inline_version on a new inode is set to 1. If that's the
		 * case, then the folio is brand new and isn't yet Uptodate.
		 */
		r = 0;
		if (index == 0 && ci->i_inline_version != 1) {
			if (!folio_test_uptodate(folio)) {
				WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
					  ci->i_inline_version);
				r = -EINVAL;
			}
			goto out;
		}
		zero_user_segment(&folio->page, 0, folio_size(folio));
		folio_mark_uptodate(folio);
		goto out;
	}

	r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
			      &ceph_netfs_read_ops, NULL);
out:
	if (r == 0)
		folio_wait_fscache(folio);
	if (r < 0) {
		if (folio)
			folio_put(folio);
	} else {
		WARN_ON_ONCE(!folio_test_locked(folio));
		*pagep = &folio->page;
	}
	return r;
}

/*
 * we don't do anything in here that simple_write_end doesn't do
 * except adjust dirty page accounting
 */
static int ceph_write_end(struct file *file, struct address_space *mapping,
			  loff_t pos, unsigned len, unsigned copied,
			  struct page *subpage, void *fsdata)
{
	struct folio *folio = page_folio(subpage);
	struct inode *inode = file_inode(file);
	bool check_cap = false;

	dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file,
	     inode, folio, (int)pos, (int)copied, (int)len);

	if (!folio_test_uptodate(folio)) {
		/* just return that nothing was copied on a short copy */
		if (copied < len) {
			copied = 0;
			goto out;
		}
		folio_mark_uptodate(folio);
	}

	/* did file size increase? */
Yan, Zheng99c88e62015-12-30 11:32:46 +08001300 if (pos+copied > i_size_read(inode))
Sage Weil1d3576f2009-10-06 11:31:09 -07001301 check_cap = ceph_inode_set_size(inode, pos+copied);
1302
David Howells78525c72021-08-11 09:49:13 +01001303 folio_mark_dirty(folio);
Sage Weil1d3576f2009-10-06 11:31:09 -07001304
Al Virob9de3132016-09-05 22:20:03 -04001305out:
David Howells78525c72021-08-11 09:49:13 +01001306 folio_unlock(folio);
1307 folio_put(folio);
Sage Weil1d3576f2009-10-06 11:31:09 -07001308
1309 if (check_cap)
1310 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1311
1312 return copied;
1313}
1314
1315/*
1316 * we set .direct_IO to indicate direct io is supported, but since we
1317 * intercept O_DIRECT reads and writes early, this function should
1318 * never get called.
1319 */
Christoph Hellwigc8b8e322016-04-07 08:51:58 -07001320static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter)
Sage Weil1d3576f2009-10-06 11:31:09 -07001321{
1322 WARN_ON(1);
1323 return -EINVAL;
1324}
1325
1326const struct address_space_operations ceph_aops = {
1327 .readpage = ceph_readpage,
Jeff Layton49870052020-07-09 14:43:23 -04001328 .readahead = ceph_readahead,
Sage Weil1d3576f2009-10-06 11:31:09 -07001329 .writepage = ceph_writepage,
1330 .writepages = ceph_writepages_start,
1331 .write_begin = ceph_write_begin,
1332 .write_end = ceph_write_end,
1333 .set_page_dirty = ceph_set_page_dirty,
1334 .invalidatepage = ceph_invalidatepage,
1335 .releasepage = ceph_releasepage,
1336 .direct_IO = ceph_direct_io,
1337};
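/*
 * A minimal sketch of how these ops are assumed to be wired up by the
 * inode setup code in fs/ceph/inode.c:
 *
 *	inode->i_mapping->a_ops = &ceph_aops;
 */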
1338
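/*
 * Block every signal except SIGKILL across the cap waits in the fault
 * paths below, so only a fatal signal can interrupt them.
 */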
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001339static void ceph_block_sigs(sigset_t *oldset)
1340{
1341 sigset_t mask;
1342 siginitsetinv(&mask, sigmask(SIGKILL));
1343 sigprocmask(SIG_BLOCK, &mask, oldset);
1344}
1345
1346static void ceph_restore_sigs(sigset_t *oldset)
1347{
1348 sigprocmask(SIG_SETMASK, oldset, NULL);
1349}
Sage Weil1d3576f2009-10-06 11:31:09 -07001350
1351/*
1352 * vm ops
1353 */
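/*
 * Read fault: pin CEPH_CAP_FILE_RD (plus CACHE/LAZYIO if wanted) and let
 * filemap_fault() service the fault from the page cache.  If no cache
 * caps were granted and the inode still holds inline data, the inline
 * blob is instead fetched with __ceph_do_getattr() into page 0.
 */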
Souptick Joarder24499842018-07-23 21:32:24 +05301354static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf)
Yan, Zheng61f68812013-11-28 14:28:14 +08001355{
Dave Jiang11bac802017-02-24 14:56:41 -08001356 struct vm_area_struct *vma = vmf->vma;
Yan, Zheng61f68812013-11-28 14:28:14 +08001357 struct inode *inode = file_inode(vma->vm_file);
1358 struct ceph_inode_info *ci = ceph_inode(inode);
1359 struct ceph_file_info *fi = vma->vm_file->private_data;
Matthew Wilcox (Oracle)c403c3a22020-10-04 19:04:24 +01001360 loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT;
Souptick Joarder24499842018-07-23 21:32:24 +05301361 int want, got, err;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001362 sigset_t oldset;
Souptick Joarder24499842018-07-23 21:32:24 +05301363 vm_fault_t ret = VM_FAULT_SIGBUS;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001364
1365 ceph_block_sigs(&oldset);
Yan, Zheng61f68812013-11-28 14:28:14 +08001366
Jeff Layton8ff2d292021-04-05 10:40:56 -04001367 dout("filemap_fault %p %llx.%llx %llu trying to get caps\n",
1368 inode, ceph_vinop(inode), off);
Yan, Zheng61f68812013-11-28 14:28:14 +08001369 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1370 want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO;
1371 else
1372 want = CEPH_CAP_FILE_CACHE;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001373
1374 got = 0;
Jeff Laytone72968e2021-04-05 12:19:35 -04001375 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got);
Souptick Joarder24499842018-07-23 21:32:24 +05301376 if (err < 0)
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001377 goto out_restore;
Yan, Zheng6ce026e2016-05-10 18:59:13 +08001378
Jeff Layton8ff2d292021-04-05 10:40:56 -04001379 dout("filemap_fault %p %llu got cap refs on %s\n",
1380 inode, off, ceph_cap_string(got));
Yan, Zheng61f68812013-11-28 14:28:14 +08001381
Yan, Zheng83701242014-11-14 22:36:18 +08001382 if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
Yan, Zheng2b1ac852016-10-25 10:51:55 +08001383 ci->i_inline_version == CEPH_INLINE_NONE) {
Yan, Zheng5d988302017-12-15 11:15:36 +08001384 CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
1385 ceph_add_rw_context(fi, &rw_ctx);
Dave Jiang11bac802017-02-24 14:56:41 -08001386 ret = filemap_fault(vmf);
Yan, Zheng5d988302017-12-15 11:15:36 +08001387 ceph_del_rw_context(fi, &rw_ctx);
Jeff Layton8ff2d292021-04-05 10:40:56 -04001388 dout("filemap_fault %p %llu drop cap refs %s ret %x\n",
1389 inode, off, ceph_cap_string(got), ret);
Yan, Zheng2b1ac852016-10-25 10:51:55 +08001390 } else
Souptick Joarder24499842018-07-23 21:32:24 +05301391 err = -EAGAIN;
Yan, Zheng61f68812013-11-28 14:28:14 +08001392
Yan, Zheng61f68812013-11-28 14:28:14 +08001393 ceph_put_cap_refs(ci, got);
1394
Souptick Joarder24499842018-07-23 21:32:24 +05301395 if (err != -EAGAIN)
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001396 goto out_restore;
Yan, Zheng83701242014-11-14 22:36:18 +08001397
1398 /* read inline data */
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001399 if (off >= PAGE_SIZE) {
Yan, Zheng83701242014-11-14 22:36:18 +08001400 /* does not support inline data > PAGE_SIZE */
1401 ret = VM_FAULT_SIGBUS;
1402 } else {
Yan, Zheng83701242014-11-14 22:36:18 +08001403 struct address_space *mapping = inode->i_mapping;
Jan Kara057ba5b2021-04-22 16:38:26 +02001404 struct page *page;
1405
1406 filemap_invalidate_lock_shared(mapping);
1407 page = find_or_create_page(mapping, 0,
1408 mapping_gfp_constraint(mapping, ~__GFP_FS));
Yan, Zheng83701242014-11-14 22:36:18 +08001409 if (!page) {
1410 ret = VM_FAULT_OOM;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001411 goto out_inline;
Yan, Zheng83701242014-11-14 22:36:18 +08001412 }
Souptick Joarder24499842018-07-23 21:32:24 +05301413 err = __ceph_do_getattr(inode, page,
Yan, Zheng83701242014-11-14 22:36:18 +08001414 CEPH_STAT_CAP_INLINE_DATA, true);
Souptick Joarder24499842018-07-23 21:32:24 +05301415 if (err < 0 || off >= i_size_read(inode)) {
Yan, Zheng83701242014-11-14 22:36:18 +08001416 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001417 put_page(page);
Souptick Joarderc64a2b02019-01-05 01:00:29 +05301418 ret = vmf_error(err);
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001419 goto out_inline;
Yan, Zheng83701242014-11-14 22:36:18 +08001420 }
Souptick Joarder24499842018-07-23 21:32:24 +05301421 if (err < PAGE_SIZE)
1422 zero_user_segment(page, err, PAGE_SIZE);
Yan, Zheng83701242014-11-14 22:36:18 +08001423 else
1424 flush_dcache_page(page);
1425 SetPageUptodate(page);
1426 vmf->page = page;
1427 ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001428out_inline:
Jan Kara057ba5b2021-04-22 16:38:26 +02001429 filemap_invalidate_unlock_shared(mapping);
Jeff Layton8ff2d292021-04-05 10:40:56 -04001430 dout("filemap_fault %p %llu read inline data ret %x\n",
1431 inode, off, ret);
Yan, Zheng83701242014-11-14 22:36:18 +08001432 }
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001433out_restore:
1434 ceph_restore_sigs(&oldset);
Souptick Joarder24499842018-07-23 21:32:24 +05301435 if (err < 0)
1436 ret = vmf_error(err);
Yan, Zheng6ce026e2016-05-10 18:59:13 +08001437
Yan, Zheng61f68812013-11-28 14:28:14 +08001438 return ret;
1439}
Sage Weil1d3576f2009-10-06 11:31:09 -07001440
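/*
 * Write fault: make the page writable.  Any inline data is uninlined
 * first, then CEPH_CAP_FILE_WR/BUFFER caps are pinned for the affected
 * range.  If the page is dirty under an older snap context we drop the
 * page lock, flush, and wait for that context to become writeable
 * before retrying.
 */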
Souptick Joarder24499842018-07-23 21:32:24 +05301441static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
Sage Weil1d3576f2009-10-06 11:31:09 -07001442{
Dave Jiang11bac802017-02-24 14:56:41 -08001443 struct vm_area_struct *vma = vmf->vma;
Al Viro496ad9a2013-01-23 17:07:38 -05001444 struct inode *inode = file_inode(vma->vm_file);
Yan, Zheng61f68812013-11-28 14:28:14 +08001445 struct ceph_inode_info *ci = ceph_inode(inode);
1446 struct ceph_file_info *fi = vma->vm_file->private_data;
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001447 struct ceph_cap_flush *prealloc_cf;
Yan, Zheng61f68812013-11-28 14:28:14 +08001448 struct page *page = vmf->page;
Alex Elder6285bc22012-10-02 10:25:51 -05001449 loff_t off = page_offset(page);
Yan, Zheng61f68812013-11-28 14:28:14 +08001450 loff_t size = i_size_read(inode);
1451 size_t len;
Souptick Joarder24499842018-07-23 21:32:24 +05301452 int want, got, err;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001453 sigset_t oldset;
Souptick Joarder24499842018-07-23 21:32:24 +05301454 vm_fault_t ret = VM_FAULT_SIGBUS;
Sage Weil1d3576f2009-10-06 11:31:09 -07001455
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001456 prealloc_cf = ceph_alloc_cap_flush();
1457 if (!prealloc_cf)
Yan, Zheng6ce026e2016-05-10 18:59:13 +08001458 return VM_FAULT_OOM;
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001459
Jeff Layton249c1df2019-08-01 10:06:40 -04001460 sb_start_pagefault(inode->i_sb);
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001461 ceph_block_sigs(&oldset);
Sage Weil1d3576f2009-10-06 11:31:09 -07001462
Yan, Zheng28127bd2014-11-14 22:38:29 +08001463 if (ci->i_inline_version != CEPH_INLINE_NONE) {
1464 struct page *locked_page = NULL;
1465 if (off == 0) {
1466 lock_page(page);
1467 locked_page = page;
1468 }
Souptick Joarder24499842018-07-23 21:32:24 +05301469 err = ceph_uninline_data(vma->vm_file, locked_page);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001470 if (locked_page)
1471 unlock_page(locked_page);
Souptick Joarder24499842018-07-23 21:32:24 +05301472 if (err < 0)
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001473 goto out_free;
Yan, Zheng28127bd2014-11-14 22:38:29 +08001474 }
1475
Jeff Layton8ff2d292021-04-05 10:40:56 -04001476 if (off + thp_size(page) <= size)
1477 len = thp_size(page);
Sage Weil1d3576f2009-10-06 11:31:09 -07001478 else
Jeff Layton8ff2d292021-04-05 10:40:56 -04001479 len = offset_in_thp(page, size);
Sage Weil1d3576f2009-10-06 11:31:09 -07001480
Yan, Zheng61f68812013-11-28 14:28:14 +08001481 dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n",
1482 inode, ceph_vinop(inode), off, len, size);
1483 if (fi->fmode & CEPH_FILE_MODE_LAZY)
1484 want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
1485 else
1486 want = CEPH_CAP_FILE_BUFFER;
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001487
1488 got = 0;
Jeff Laytone72968e2021-04-05 12:19:35 -04001489 err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got);
Souptick Joarder24499842018-07-23 21:32:24 +05301490 if (err < 0)
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001491 goto out_free;
Yan, Zheng6ce026e2016-05-10 18:59:13 +08001492
Yan, Zheng61f68812013-11-28 14:28:14 +08001493 dout("page_mkwrite %p %llu~%zd got cap refs on %s\n",
1494 inode, off, len, ceph_cap_string(got));
1495
1496 /* Update time before taking page lock */
1497 file_update_time(vma->vm_file);
Jeff Layton5c308352019-06-06 08:57:27 -04001498 inode_inc_iversion_raw(inode);
Yehuda Sadeh4af6b222010-02-09 11:02:51 -08001499
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001500 do {
Jeff Laytond45156b2020-05-28 14:59:49 -04001501 struct ceph_snap_context *snapc;
1502
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001503 lock_page(page);
Yehuda Sadeh4af6b222010-02-09 11:02:51 -08001504
Andreas Gruenbachercb03c142020-02-13 21:24:22 +01001505 if (page_mkwrite_check_truncate(page, inode) < 0) {
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001506 unlock_page(page);
1507 ret = VM_FAULT_NOPAGE;
1508 break;
1509 }
Yehuda Sadeh4af6b222010-02-09 11:02:51 -08001510
Jeff Laytond45156b2020-05-28 14:59:49 -04001511 snapc = ceph_find_incompatible(page);
1512 if (!snapc) {
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001513 /* success. we'll keep the page locked. */
1514 set_page_dirty(page);
1515 ret = VM_FAULT_LOCKED;
Jeff Laytond45156b2020-05-28 14:59:49 -04001516 break;
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001517 }
Jeff Laytond45156b2020-05-28 14:59:49 -04001518
1519 unlock_page(page);
1520
1521 if (IS_ERR(snapc)) {
1522 ret = VM_FAULT_SIGBUS;
1523 break;
1524 }
1525
1526 ceph_queue_writeback(inode);
1527 err = wait_event_killable(ci->i_cap_wq,
1528 context_is_writeable_or_written(inode, snapc));
1529 ceph_put_snap_context(snapc);
1530 } while (err == 0);
Yan, Zhengf0b33df2016-05-10 19:09:06 +08001531
Yan, Zheng28127bd2014-11-14 22:38:29 +08001532 if (ret == VM_FAULT_LOCKED ||
1533 ci->i_inline_version != CEPH_INLINE_NONE) {
Yan, Zheng61f68812013-11-28 14:28:14 +08001534 int dirty;
1535 spin_lock(&ci->i_ceph_lock);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001536 ci->i_inline_version = CEPH_INLINE_NONE;
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001537 dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
1538 &prealloc_cf);
Yan, Zheng61f68812013-11-28 14:28:14 +08001539 spin_unlock(&ci->i_ceph_lock);
1540 if (dirty)
1541 __mark_inode_dirty(inode, dirty);
1542 }
1543
Souptick Joarder24499842018-07-23 21:32:24 +05301544 dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %x\n",
Yan, Zheng61f68812013-11-28 14:28:14 +08001545 inode, off, len, ceph_cap_string(got), ret);
Jeff Laytona8810cd2020-12-10 14:39:26 -05001546 ceph_put_cap_refs_async(ci, got);
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001547out_free:
Yan, Zheng4f7e89f2016-05-10 18:40:28 +08001548 ceph_restore_sigs(&oldset);
Jeff Layton249c1df2019-08-01 10:06:40 -04001549 sb_end_pagefault(inode->i_sb);
Yan, Zhengf66fd9f2015-06-10 17:26:13 +08001550 ceph_free_cap_flush(prealloc_cf);
Souptick Joarder24499842018-07-23 21:32:24 +05301551 if (err < 0)
1552 ret = vmf_error(err);
Sage Weil1d3576f2009-10-06 11:31:09 -07001553 return ret;
1554}
1555
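/*
 * Copy an inline data blob received from the MDS into page 0 of the
 * mapping (or into the caller-supplied locked page).  When the page was
 * allocated here, the tail is zero-filled and the page marked uptodate
 * before it is unlocked and released; a caller-provided page is left
 * locked for the caller to finish.
 */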
Yan, Zheng31c542a2014-11-14 21:41:55 +08001556void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
1557 char *data, size_t len)
1558{
1559 struct address_space *mapping = inode->i_mapping;
1560 struct page *page;
1561
1562 if (locked_page) {
1563 page = locked_page;
1564 } else {
1565 if (i_size_read(inode) == 0)
1566 return;
1567 page = find_or_create_page(mapping, 0,
Michal Hockoc62d2552015-11-06 16:28:49 -08001568 mapping_gfp_constraint(mapping,
1569 ~__GFP_FS));
Yan, Zheng31c542a2014-11-14 21:41:55 +08001570 if (!page)
1571 return;
1572 if (PageUptodate(page)) {
1573 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001574 put_page(page);
Yan, Zheng31c542a2014-11-14 21:41:55 +08001575 return;
1576 }
1577 }
1578
Ilya Dryomov0668ff52014-12-19 13:10:10 +03001579 dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
Yan, Zheng31c542a2014-11-14 21:41:55 +08001580 inode, ceph_vinop(inode), len, locked_page);
1581
1582 if (len > 0) {
1583 void *kaddr = kmap_atomic(page);
1584 memcpy(kaddr, data, len);
1585 kunmap_atomic(kaddr);
1586 }
1587
1588 if (page != locked_page) {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001589 if (len < PAGE_SIZE)
1590 zero_user_segment(page, len, PAGE_SIZE);
Yan, Zheng31c542a2014-11-14 21:41:55 +08001591 else
1592 flush_dcache_page(page);
1593
1594 SetPageUptodate(page);
1595 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001596 put_page(page);
Yan, Zheng31c542a2014-11-14 21:41:55 +08001597 }
1598}
1599
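/*
 * Write the inline data out to the inode's first RADOS object so the
 * file can be handled like any other.  The object is created first,
 * then written together with a CMPXATTR guard and SETXATTR update of
 * the "inline_version" xattr, so a racing uninline from another client
 * cannot clobber newer data; -ECANCELED from the guard is treated as
 * success.
 */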
Yan, Zheng28127bd2014-11-14 22:38:29 +08001600int ceph_uninline_data(struct file *filp, struct page *locked_page)
1601{
1602 struct inode *inode = file_inode(filp);
1603 struct ceph_inode_info *ci = ceph_inode(inode);
1604 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
1605 struct ceph_osd_request *req;
1606 struct page *page = NULL;
1607 u64 len, inline_version;
1608 int err = 0;
1609 bool from_pagecache = false;
1610
1611 spin_lock(&ci->i_ceph_lock);
1612 inline_version = ci->i_inline_version;
1613 spin_unlock(&ci->i_ceph_lock);
1614
1615 dout("uninline_data %p %llx.%llx inline_version %llu\n",
1616 inode, ceph_vinop(inode), inline_version);
1617
1618 if (inline_version == 1 || /* initial version, no data */
1619 inline_version == CEPH_INLINE_NONE)
1620 goto out;
1621
1622 if (locked_page) {
1623 page = locked_page;
1624 WARN_ON(!PageUptodate(page));
1625 } else if (ceph_caps_issued(ci) &
1626 (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
1627 page = find_get_page(inode->i_mapping, 0);
1628 if (page) {
1629 if (PageUptodate(page)) {
1630 from_pagecache = true;
1631 lock_page(page);
1632 } else {
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001633 put_page(page);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001634 page = NULL;
1635 }
1636 }
1637 }
1638
1639 if (page) {
1640 len = i_size_read(inode);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001641 if (len > PAGE_SIZE)
1642 len = PAGE_SIZE;
Yan, Zheng28127bd2014-11-14 22:38:29 +08001643 } else {
1644 page = __page_cache_alloc(GFP_NOFS);
1645 if (!page) {
1646 err = -ENOMEM;
1647 goto out;
1648 }
1649 err = __ceph_do_getattr(inode, page,
1650 CEPH_STAT_CAP_INLINE_DATA, true);
1651 if (err < 0) {
1652 /* no inline data */
1653 if (err == -ENODATA)
1654 err = 0;
1655 goto out;
1656 }
1657 len = err;
1658 }
1659
1660 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1661 ceph_vino(inode), 0, &len, 0, 1,
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001662 CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_WRITE,
Ilya Dryomov34b759b2016-02-16 15:00:24 +01001663 NULL, 0, 0, false);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001664 if (IS_ERR(req)) {
1665 err = PTR_ERR(req);
1666 goto out;
1667 }
1668
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001669 req->r_mtime = inode->i_mtime;
Yan, Zheng28127bd2014-11-14 22:38:29 +08001670 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1671 if (!err)
1672 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
1673 ceph_osdc_put_request(req);
1674 if (err < 0)
1675 goto out;
1676
1677 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
1678 ceph_vino(inode), 0, &len, 1, 3,
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001679 CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE,
Ilya Dryomov34b759b2016-02-16 15:00:24 +01001680 NULL, ci->i_truncate_seq,
1681 ci->i_truncate_size, false);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001682 if (IS_ERR(req)) {
1683 err = PTR_ERR(req);
1684 goto out;
1685 }
1686
1687 osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
1688
Yan, Zhengec137c12015-04-13 11:25:07 +08001689 {
1690 __le64 xattr_buf = cpu_to_le64(inline_version);
1691 err = osd_req_op_xattr_init(req, 0, CEPH_OSD_OP_CMPXATTR,
1692 "inline_version", &xattr_buf,
1693 sizeof(xattr_buf),
1694 CEPH_OSD_CMPXATTR_OP_GT,
1695 CEPH_OSD_CMPXATTR_MODE_U64);
1696 if (err)
1697 goto out_put;
1698 }
Yan, Zheng28127bd2014-11-14 22:38:29 +08001699
Yan, Zhengec137c12015-04-13 11:25:07 +08001700 {
1701 char xattr_buf[32];
1702 int xattr_len = snprintf(xattr_buf, sizeof(xattr_buf),
1703 "%llu", inline_version);
1704 err = osd_req_op_xattr_init(req, 2, CEPH_OSD_OP_SETXATTR,
1705 "inline_version",
1706 xattr_buf, xattr_len, 0, 0);
1707 if (err)
1708 goto out_put;
1709 }
Yan, Zheng28127bd2014-11-14 22:38:29 +08001710
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001711 req->r_mtime = inode->i_mtime;
Yan, Zheng28127bd2014-11-14 22:38:29 +08001712 err = ceph_osdc_start_request(&fsc->client->osdc, req, false);
1713 if (!err)
1714 err = ceph_osdc_wait_request(&fsc->client->osdc, req);
Xiubo Li97e27aa2020-03-19 23:45:01 -04001715
Xiubo Li8ae99ae2021-03-22 20:28:49 +08001716 ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
Xiubo Li903f4fec2021-05-13 09:40:53 +08001717 req->r_end_latency, len, err);
Xiubo Li97e27aa2020-03-19 23:45:01 -04001718
Yan, Zheng28127bd2014-11-14 22:38:29 +08001719out_put:
1720 ceph_osdc_put_request(req);
1721 if (err == -ECANCELED)
1722 err = 0;
1723out:
1724 if (page && page != locked_page) {
1725 if (from_pagecache) {
1726 unlock_page(page);
Kirill A. Shutemov09cbfea2016-04-01 15:29:47 +03001727 put_page(page);
Yan, Zheng28127bd2014-11-14 22:38:29 +08001728 } else
1729 __free_pages(page, 0);
1730 }
1731
1732 dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
1733 inode, ceph_vinop(inode), inline_version, err);
1734 return err;
1735}
1736
Kirill A. Shutemov7cbea8d2015-09-09 15:39:26 -07001737static const struct vm_operations_struct ceph_vmops = {
Yan, Zheng61f68812013-11-28 14:28:14 +08001738 .fault = ceph_filemap_fault,
Sage Weil1d3576f2009-10-06 11:31:09 -07001739 .page_mkwrite = ceph_page_mkwrite,
1740};
1741
1742int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1743{
1744 struct address_space *mapping = file->f_mapping;
1745
1746 if (!mapping->a_ops->readpage)
1747 return -ENOEXEC;
1748 file_accessed(file);
1749 vma->vm_ops = &ceph_vmops;
Sage Weil1d3576f2009-10-06 11:31:09 -07001750 return 0;
1751}
Yan, Zheng10183a62015-04-27 15:33:28 +08001752
1753enum {
1754 POOL_READ = 1,
1755 POOL_WRITE = 2,
1756};
1757
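/*
 * Probe read/write access to a data pool (and namespace) by issuing a
 * STAT read and an exclusive CREATE write against the inode's first
 * object; -ENOENT still proves read access and -EEXIST still proves
 * write access.  Results are cached in mdsc->pool_perm_tree, keyed by
 * (pool, pool_ns), so each pool is only probed once.
 */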
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001758static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
1759 s64 pool, struct ceph_string *pool_ns)
Yan, Zheng10183a62015-04-27 15:33:28 +08001760{
1761 struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
1762 struct ceph_mds_client *mdsc = fsc->mdsc;
1763 struct ceph_osd_request *rd_req = NULL, *wr_req = NULL;
1764 struct rb_node **p, *parent;
1765 struct ceph_pool_perm *perm;
1766 struct page **pages;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001767 size_t pool_ns_len;
Yan, Zheng10183a62015-04-27 15:33:28 +08001768 int err = 0, err2 = 0, have = 0;
1769
1770 down_read(&mdsc->pool_perm_rwsem);
1771 p = &mdsc->pool_perm_tree.rb_node;
1772 while (*p) {
1773 perm = rb_entry(*p, struct ceph_pool_perm, node);
1774 if (pool < perm->pool)
1775 p = &(*p)->rb_left;
1776 else if (pool > perm->pool)
1777 p = &(*p)->rb_right;
1778 else {
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001779 int ret = ceph_compare_string(pool_ns,
1780 perm->pool_ns,
1781 perm->pool_ns_len);
1782 if (ret < 0)
1783 p = &(*p)->rb_left;
1784 else if (ret > 0)
1785 p = &(*p)->rb_right;
1786 else {
1787 have = perm->perm;
1788 break;
1789 }
Yan, Zheng10183a62015-04-27 15:33:28 +08001790 }
1791 }
1792 up_read(&mdsc->pool_perm_rwsem);
1793 if (*p)
1794 goto out;
1795
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001796 if (pool_ns)
1797 dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
1798 pool, (int)pool_ns->len, pool_ns->str);
1799 else
1800 dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
Yan, Zheng10183a62015-04-27 15:33:28 +08001801
1802 down_write(&mdsc->pool_perm_rwsem);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001803 p = &mdsc->pool_perm_tree.rb_node;
Yan, Zheng10183a62015-04-27 15:33:28 +08001804 parent = NULL;
1805 while (*p) {
1806 parent = *p;
1807 perm = rb_entry(parent, struct ceph_pool_perm, node);
1808 if (pool < perm->pool)
1809 p = &(*p)->rb_left;
1810 else if (pool > perm->pool)
1811 p = &(*p)->rb_right;
1812 else {
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001813 int ret = ceph_compare_string(pool_ns,
1814 perm->pool_ns,
1815 perm->pool_ns_len);
1816 if (ret < 0)
1817 p = &(*p)->rb_left;
1818 else if (ret > 0)
1819 p = &(*p)->rb_right;
1820 else {
1821 have = perm->perm;
1822 break;
1823 }
Yan, Zheng10183a62015-04-27 15:33:28 +08001824 }
1825 }
1826 if (*p) {
1827 up_write(&mdsc->pool_perm_rwsem);
1828 goto out;
1829 }
1830
Ilya Dryomov34b759b2016-02-16 15:00:24 +01001831 rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
Yan, Zheng10183a62015-04-27 15:33:28 +08001832 1, false, GFP_NOFS);
1833 if (!rd_req) {
1834 err = -ENOMEM;
1835 goto out_unlock;
1836 }
1837
1838 rd_req->r_flags = CEPH_OSD_FLAG_READ;
1839 osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
1840 rd_req->r_base_oloc.pool = pool;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001841 if (pool_ns)
1842 rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
Ilya Dryomovd30291b2016-04-29 19:54:20 +02001843 ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
Yan, Zheng10183a62015-04-27 15:33:28 +08001844
Ilya Dryomov13d1ad12016-04-27 14:15:51 +02001845 err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
1846 if (err)
1847 goto out_unlock;
Yan, Zheng10183a62015-04-27 15:33:28 +08001848
Ilya Dryomov34b759b2016-02-16 15:00:24 +01001849 wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
Yan, Zheng10183a62015-04-27 15:33:28 +08001850 1, false, GFP_NOFS);
1851 if (!wr_req) {
1852 err = -ENOMEM;
1853 goto out_unlock;
1854 }
1855
Ilya Dryomov54ea0042017-02-11 18:48:41 +01001856 wr_req->r_flags = CEPH_OSD_FLAG_WRITE;
Yan, Zheng10183a62015-04-27 15:33:28 +08001857 osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL);
Ilya Dryomov63244fa2016-04-28 16:07:23 +02001858 ceph_oloc_copy(&wr_req->r_base_oloc, &rd_req->r_base_oloc);
Ilya Dryomovd30291b2016-04-29 19:54:20 +02001859 ceph_oid_copy(&wr_req->r_base_oid, &rd_req->r_base_oid);
Yan, Zheng10183a62015-04-27 15:33:28 +08001860
Ilya Dryomov13d1ad12016-04-27 14:15:51 +02001861 err = ceph_osdc_alloc_messages(wr_req, GFP_NOFS);
1862 if (err)
1863 goto out_unlock;
Yan, Zheng10183a62015-04-27 15:33:28 +08001864
1865 /* one page should be large enough for STAT data */
1866 pages = ceph_alloc_page_vector(1, GFP_KERNEL);
1867 if (IS_ERR(pages)) {
1868 err = PTR_ERR(pages);
1869 goto out_unlock;
1870 }
1871
1872 osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE,
1873 0, false, true);
Yan, Zheng10183a62015-04-27 15:33:28 +08001874 err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false);
1875
Arnd Bergmannfac02dd2018-07-13 22:18:37 +02001876 wr_req->r_mtime = ci->vfs_inode.i_mtime;
Yan, Zheng10183a62015-04-27 15:33:28 +08001877 err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false);
1878
1879 if (!err)
1880 err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req);
1881 if (!err2)
1882 err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req);
1883
1884 if (err >= 0 || err == -ENOENT)
1885 have |= POOL_READ;
Yan, Zheng131d7eb2019-07-25 20:16:47 +08001886 else if (err != -EPERM) {
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02001887 if (err == -EBLOCKLISTED)
1888 fsc->blocklisted = true;
Yan, Zheng10183a62015-04-27 15:33:28 +08001889 goto out_unlock;
Yan, Zheng131d7eb2019-07-25 20:16:47 +08001890 }
Yan, Zheng10183a62015-04-27 15:33:28 +08001891
1892 if (err2 == 0 || err2 == -EEXIST)
1893 have |= POOL_WRITE;
1894 else if (err2 != -EPERM) {
Ilya Dryomov0b98acd2020-09-14 13:39:19 +02001895 if (err2 == -EBLOCKLISTED)
1896 fsc->blocklisted = true;
Yan, Zheng10183a62015-04-27 15:33:28 +08001897 err = err2;
1898 goto out_unlock;
1899 }
1900
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001901 pool_ns_len = pool_ns ? pool_ns->len : 0;
1902 perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
Yan, Zheng10183a62015-04-27 15:33:28 +08001903 if (!perm) {
1904 err = -ENOMEM;
1905 goto out_unlock;
1906 }
1907
1908 perm->pool = pool;
1909 perm->perm = have;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001910 perm->pool_ns_len = pool_ns_len;
1911 if (pool_ns_len > 0)
1912 memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
1913 perm->pool_ns[pool_ns_len] = 0;
1914
Yan, Zheng10183a62015-04-27 15:33:28 +08001915 rb_link_node(&perm->node, parent, p);
1916 rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
1917 err = 0;
1918out_unlock:
1919 up_write(&mdsc->pool_perm_rwsem);
1920
Ilya Dryomov3ed97d62016-04-26 15:05:29 +02001921 ceph_osdc_put_request(rd_req);
1922 ceph_osdc_put_request(wr_req);
Yan, Zheng10183a62015-04-27 15:33:28 +08001923out:
1924 if (!err)
1925 err = have;
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001926 if (pool_ns)
1927 dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
1928 pool, (int)pool_ns->len, pool_ns->str, err);
1929 else
1930 dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
Yan, Zheng10183a62015-04-27 15:33:28 +08001931 return err;
1932}
1933
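/*
 * Check (and lazily populate) the cached CEPH_I_POOL_* flags on the
 * inode before allowing buffered reads or writes.  Only regular files
 * are checked; snapshot inodes and mounts with the nopoolperm option
 * skip the probe entirely.
 */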
Yan, Zheng5e3ded12019-07-25 20:16:43 +08001934int ceph_pool_perm_check(struct inode *inode, int need)
Yan, Zheng10183a62015-04-27 15:33:28 +08001935{
Yan, Zheng5e3ded12019-07-25 20:16:43 +08001936 struct ceph_inode_info *ci = ceph_inode(inode);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001937 struct ceph_string *pool_ns;
Yan, Zheng5e3ded12019-07-25 20:16:43 +08001938 s64 pool;
Yan, Zheng10183a62015-04-27 15:33:28 +08001939 int ret, flags;
1940
Jeff Laytone9b22502021-01-26 11:49:54 -05001941 /* Only need to do this for regular files */
1942 if (!S_ISREG(inode->i_mode))
1943 return 0;
1944
Yan, Zheng80e80fb2016-12-13 16:03:26 +08001945 if (ci->i_vino.snap != CEPH_NOSNAP) {
1946 /*
1947 * Pool permission check needs to write to the first object.
1948 * But for snapshot, head of the first object may have alread
1949 * been deleted. Skip check to avoid creating orphan object.
1950 */
1951 return 0;
1952 }
1953
Yan, Zheng5e3ded12019-07-25 20:16:43 +08001954 if (ceph_test_mount_opt(ceph_inode_to_client(inode),
Yan, Zheng10183a62015-04-27 15:33:28 +08001955 NOPOOLPERM))
1956 return 0;
1957
1958 spin_lock(&ci->i_ceph_lock);
1959 flags = ci->i_ceph_flags;
Yan, Zheng76271512016-02-03 21:24:49 +08001960 pool = ci->i_layout.pool_id;
Yan, Zheng10183a62015-04-27 15:33:28 +08001961 spin_unlock(&ci->i_ceph_lock);
1962check:
1963 if (flags & CEPH_I_POOL_PERM) {
1964 if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
Yan, Zheng76271512016-02-03 21:24:49 +08001965 dout("ceph_pool_perm_check pool %lld no read perm\n",
Yan, Zheng10183a62015-04-27 15:33:28 +08001966 pool);
1967 return -EPERM;
1968 }
1969 if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
Yan, Zheng76271512016-02-03 21:24:49 +08001970 dout("ceph_pool_perm_check pool %lld no write perm\n",
Yan, Zheng10183a62015-04-27 15:33:28 +08001971 pool);
1972 return -EPERM;
1973 }
1974 return 0;
1975 }
1976
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001977 pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
1978 ret = __ceph_pool_perm_get(ci, pool, pool_ns);
1979 ceph_put_string(pool_ns);
Yan, Zheng10183a62015-04-27 15:33:28 +08001980 if (ret < 0)
1981 return ret;
1982
1983 flags = CEPH_I_POOL_PERM;
1984 if (ret & POOL_READ)
1985 flags |= CEPH_I_POOL_RD;
1986 if (ret & POOL_WRITE)
1987 flags |= CEPH_I_POOL_WR;
1988
1989 spin_lock(&ci->i_ceph_lock);
Yan, Zheng779fe0f2016-03-07 09:35:06 +08001990 if (pool == ci->i_layout.pool_id &&
1991 pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
1992 ci->i_ceph_flags |= flags;
Yan, Zheng10183a62015-04-27 15:33:28 +08001993 } else {
Yan, Zheng76271512016-02-03 21:24:49 +08001994 pool = ci->i_layout.pool_id;
Yan, Zheng10183a62015-04-27 15:33:28 +08001995 flags = ci->i_ceph_flags;
1996 }
1997 spin_unlock(&ci->i_ceph_lock);
1998 goto check;
1999}
2000
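/* Drop all cached pool permission entries, e.g. at unmount time. */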
2001void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc)
2002{
2003 struct ceph_pool_perm *perm;
2004 struct rb_node *n;
2005
2006 while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) {
2007 n = rb_first(&mdsc->pool_perm_tree);
2008 perm = rb_entry(n, struct ceph_pool_perm, node);
2009 rb_erase(n, &mdsc->pool_perm_tree);
2010 kfree(perm);
2011 }
2012}