Blame - fs/iomap.c - SHIFTPHONES/kernel/common

blob: 2a20d50f0c1d5936830c527e1b0ee0c09a5357e1 [file] [log] [blame]

Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	1	/*
				2	* Copyright (C) 2010 Red Hat, Inc.
				3	* Copyright (c) 2016 Christoph Hellwig.
				4	*
				5	* This program is free software; you can redistribute it and/or modify it
				6	* under the terms and conditions of the GNU General Public License,
				7	* version 2, as published by the Free Software Foundation.
				8	*
				9	* This program is distributed in the hope it will be useful, but WITHOUT
				10	* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
				11	* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
				12	* more details.
				13	*/
				14	#include <linux/module.h>
				15	#include <linux/compiler.h>
				16	#include <linux/fs.h>
				17	#include <linux/iomap.h>
				18	#include <linux/uaccess.h>
				19	#include <linux/gfp.h>
				20	#include <linux/mm.h>
				21	#include <linux/swap.h>
				22	#include <linux/pagemap.h>
Christoph Hellwig	8a78cb1	2018-06-01 09:04:40 -0700	[diff] [blame]	23	#include <linux/pagevec.h>
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	24	#include <linux/file.h>
				25	#include <linux/uio.h>
				26	#include <linux/backing-dev.h>
				27	#include <linux/buffer_head.h>
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	28	#include <linux/task_io_accounting_ops.h>
Christoph Hellwig	9a286f0	2016-06-21 09:31:39 +1000	[diff] [blame]	29	#include <linux/dax.h>
Ingo Molnar	f361bf4	2017-02-03 23:47:37 +0100	[diff] [blame]	30	#include <linux/sched/signal.h>
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	31	#include <linux/swap.h>
Ingo Molnar	f361bf4	2017-02-03 23:47:37 +0100	[diff] [blame]	32
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	33	#include "internal.h"
				34
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	35	/*
				36	* Execute an iomap write on a segment of the mapping that spans a
				37	* contiguous range of pages that have identical block mapping state.
				38	*
				39	* This avoids the need to map pages individually, do individual allocations
				40	* for each page and most importantly avoid the need for filesystem specific
				41	* locking per page. Instead, all the operations are amortised over the entire
				42	* range of pages. It is assumed that the filesystems will lock whatever
				43	* resources they require in the iomap_begin call, and release them in the
				44	* iomap_end call.
				45	*/
Christoph Hellwig	befb503	2016-09-19 11:24:49 +1000	[diff] [blame]	46	loff_t
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	47	iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	48	const struct iomap_ops *ops, void *data, iomap_actor_t actor)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	49	{
				50	struct iomap iomap = { 0 };
				51	loff_t written = 0, ret;
				52
				53	/*
				54	* Need to map a range from start position for length bytes. This can
				55	* span multiple pages - it is only guaranteed to return a range of a
				56	* single type of pages (e.g. all into a hole, all mapped or all
				57	* unwritten). Failure at this point has nothing to undo.
				58	*
				59	* If allocation is required for this range, reserve the space now so
				60	* that the allocation is guaranteed to succeed later on. Once we copy
				61	* the data into the page cache pages, then we cannot fail otherwise we
				62	* expose transient stale data. If the reserve fails, we can safely
				63	* back out at this point as there is nothing to undo.
				64	*/
				65	ret = ops->iomap_begin(inode, pos, length, flags, &iomap);
				66	if (ret)
				67	return ret;
				68	if (WARN_ON(iomap.offset > pos))
				69	return -EIO;
Darrick J. Wong	0c6dda7	2018-01-26 11:11:20 -0800	[diff] [blame]	70	if (WARN_ON(iomap.length == 0))
				71	return -EIO;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	72
				73	/*
				74	* Cut down the length to the one actually provided by the filesystem,
				75	* as it might not be able to give us the whole size that we requested.
				76	*/
				77	if (iomap.offset + iomap.length < pos + length)
				78	length = iomap.offset + iomap.length - pos;
				79
				80	/*
				81	* Now that we have guaranteed that the space allocation will succeed,
				82	* we can do the copy-in page by page without having to worry about
				83	* failures exposing transient data.
				84	*/
				85	written = actor(inode, pos, length, data, &iomap);
				86
				87	/*
				88	* Now the data has been copied, commit the range we've copied. This
				89	* should not fail unless the filesystem has had a fatal error.
				90	*/
Christoph Hellwig	f20ac7a	2016-08-17 08:42:34 +1000	[diff] [blame]	91	if (ops->iomap_end) {
				92	ret = ops->iomap_end(inode, pos, length,
				93	written > 0 ? written : 0,
				94	flags, &iomap);
				95	}
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	96
				97	return written ? written : ret;
				98	}
				99
Christoph Hellwig	57fc505	2018-06-01 09:03:08 -0700	[diff] [blame]	100	static sector_t
				101	iomap_sector(struct iomap *iomap, loff_t pos)
				102	{
				103	return (iomap->addr + pos - iomap->offset) >> SECTOR_SHIFT;
				104	}
				105
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	106	static void
				107	iomap_write_failed(struct inode *inode, loff_t pos, unsigned len)
				108	{
				109	loff_t i_size = i_size_read(inode);
				110
				111	/*
				112	* Only truncate newly allocated pages beyond EOF, even if the
				113	* write started inside the existing inode size.
				114	*/
				115	if (pos + len > i_size)
				116	truncate_pagecache_range(inode, max(pos, i_size), pos + len);
				117	}
				118
				119	static int
				120	iomap_write_begin(struct inode *inode, loff_t pos, unsigned len, unsigned flags,
				121	struct page **pagep, struct iomap *iomap)
				122	{
				123	pgoff_t index = pos >> PAGE_SHIFT;
				124	struct page *page;
				125	int status = 0;
				126
				127	BUG_ON(pos + len > iomap->offset + iomap->length);
				128
Michal Hocko	d1908f5	2017-02-03 13:13:26 -0800	[diff] [blame]	129	if (fatal_signal_pending(current))
				130	return -EINTR;
				131
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	132	page = grab_cache_page_write_begin(inode->i_mapping, index, flags);
				133	if (!page)
				134	return -ENOMEM;
				135
				136	status = __block_write_begin_int(page, pos, len, NULL, iomap);
				137	if (unlikely(status)) {
				138	unlock_page(page);
				139	put_page(page);
				140	page = NULL;
				141
				142	iomap_write_failed(inode, pos, len);
				143	}
				144
				145	*pagep = page;
				146	return status;
				147	}
				148
				149	static int
				150	iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
				151	unsigned copied, struct page *page)
				152	{
				153	int ret;
				154
				155	ret = generic_write_end(NULL, inode->i_mapping, pos, len,
				156	copied, page, NULL);
				157	if (ret < len)
				158	iomap_write_failed(inode, pos, len);
				159	return ret;
				160	}
				161
				162	static loff_t
				163	iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
				164	struct iomap *iomap)
				165	{
				166	struct iov_iter *i = data;
				167	long status = 0;
				168	ssize_t written = 0;
				169	unsigned int flags = AOP_FLAG_NOFS;
				170
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	171	do {
				172	struct page *page;
				173	unsigned long offset; /* Offset into pagecache page */
				174	unsigned long bytes; /* Bytes to write to page */
				175	size_t copied; /* Bytes copied from user */
				176
				177	offset = (pos & (PAGE_SIZE - 1));
				178	bytes = min_t(unsigned long, PAGE_SIZE - offset,
				179	iov_iter_count(i));
				180	again:
				181	if (bytes > length)
				182	bytes = length;
				183
				184	/*
				185	* Bring in the user page that we will copy from _first_.
				186	* Otherwise there's a nasty deadlock on copying from the
				187	* same page as we're writing to, without it being marked
				188	* up-to-date.
				189	*
				190	* Not only is this an optimisation, but it is also required
				191	* to check that the address is actually valid, when atomic
				192	* usercopies are used, below.
				193	*/
				194	if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
				195	status = -EFAULT;
				196	break;
				197	}
				198
				199	status = iomap_write_begin(inode, pos, bytes, flags, &page,
				200	iomap);
				201	if (unlikely(status))
				202	break;
				203
				204	if (mapping_writably_mapped(inode->i_mapping))
				205	flush_dcache_page(page);
				206
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	207	copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	208
				209	flush_dcache_page(page);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	210
				211	status = iomap_write_end(inode, pos, bytes, copied, page);
				212	if (unlikely(status < 0))
				213	break;
				214	copied = status;
				215
				216	cond_resched();
				217
				218	iov_iter_advance(i, copied);
				219	if (unlikely(copied == 0)) {
				220	/*
				221	* If we were unable to copy any data at all, we must
				222	* fall back to a single segment length write.
				223	*
				224	* If we didn't fallback here, we could livelock
				225	* because not all segments in the iov can be copied at
				226	* once without a pagefault.
				227	*/
				228	bytes = min_t(unsigned long, PAGE_SIZE - offset,
				229	iov_iter_single_seg_count(i));
				230	goto again;
				231	}
				232	pos += copied;
				233	written += copied;
				234	length -= copied;
				235
				236	balance_dirty_pages_ratelimited(inode->i_mapping);
				237	} while (iov_iter_count(i) && length);
				238
				239	return written ? written : status;
				240	}
				241
				242	ssize_t
				243	iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *iter,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	244	const struct iomap_ops *ops)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	245	{
				246	struct inode *inode = iocb->ki_filp->f_mapping->host;
				247	loff_t pos = iocb->ki_pos, ret = 0, written = 0;
				248
				249	while (iov_iter_count(iter)) {
				250	ret = iomap_apply(inode, pos, iov_iter_count(iter),
				251	IOMAP_WRITE, ops, iter, iomap_write_actor);
				252	if (ret <= 0)
				253	break;
				254	pos += ret;
				255	written += ret;
				256	}
				257
				258	return written ? written : ret;
				259	}
				260	EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
				261
Christoph Hellwig	5f4e575	2016-09-19 10:12:45 +1000	[diff] [blame]	262	static struct page *
				263	__iomap_read_page(struct inode *inode, loff_t offset)
				264	{
				265	struct address_space *mapping = inode->i_mapping;
				266	struct page *page;
				267
				268	page = read_mapping_page(mapping, offset >> PAGE_SHIFT, NULL);
				269	if (IS_ERR(page))
				270	return page;
				271	if (!PageUptodate(page)) {
				272	put_page(page);
				273	return ERR_PTR(-EIO);
				274	}
				275	return page;
				276	}
				277
				278	static loff_t
				279	iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
				280	struct iomap *iomap)
				281	{
				282	long status = 0;
				283	ssize_t written = 0;
				284
				285	do {
				286	struct page *page, *rpage;
				287	unsigned long offset; /* Offset into pagecache page */
				288	unsigned long bytes; /* Bytes to write to page */
				289
				290	offset = (pos & (PAGE_SIZE - 1));
Christoph Hellwig	e28ae8e	2017-08-11 12:45:35 -0700	[diff] [blame]	291	bytes = min_t(loff_t, PAGE_SIZE - offset, length);
Christoph Hellwig	5f4e575	2016-09-19 10:12:45 +1000	[diff] [blame]	292
				293	rpage = __iomap_read_page(inode, pos);
				294	if (IS_ERR(rpage))
				295	return PTR_ERR(rpage);
				296
				297	status = iomap_write_begin(inode, pos, bytes,
Tetsuo Handa	c718a97	2017-05-08 15:58:59 -0700	[diff] [blame]	298	AOP_FLAG_NOFS, &page, iomap);
Christoph Hellwig	5f4e575	2016-09-19 10:12:45 +1000	[diff] [blame]	299	put_page(rpage);
				300	if (unlikely(status))
				301	return status;
				302
				303	WARN_ON_ONCE(!PageUptodate(page));
				304
				305	status = iomap_write_end(inode, pos, bytes, bytes, page);
				306	if (unlikely(status <= 0)) {
				307	if (WARN_ON_ONCE(status == 0))
				308	return -EIO;
				309	return status;
				310	}
				311
				312	cond_resched();
				313
				314	pos += status;
				315	written += status;
				316	length -= status;
				317
				318	balance_dirty_pages_ratelimited(inode->i_mapping);
				319	} while (length);
				320
				321	return written;
				322	}
				323
				324	int
				325	iomap_file_dirty(struct inode *inode, loff_t pos, loff_t len,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	326	const struct iomap_ops *ops)
Christoph Hellwig	5f4e575	2016-09-19 10:12:45 +1000	[diff] [blame]	327	{
				328	loff_t ret;
				329
				330	while (len) {
				331	ret = iomap_apply(inode, pos, len, IOMAP_WRITE, ops, NULL,
				332	iomap_dirty_actor);
				333	if (ret <= 0)
				334	return ret;
				335	pos += ret;
				336	len -= ret;
				337	}
				338
				339	return 0;
				340	}
				341	EXPORT_SYMBOL_GPL(iomap_file_dirty);
				342
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	343	static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset,
				344	unsigned bytes, struct iomap *iomap)
				345	{
				346	struct page *page;
				347	int status;
				348
Tetsuo Handa	c718a97	2017-05-08 15:58:59 -0700	[diff] [blame]	349	status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page,
				350	iomap);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	351	if (status)
				352	return status;
				353
				354	zero_user(page, offset, bytes);
				355	mark_page_accessed(page);
				356
				357	return iomap_write_end(inode, pos, bytes, bytes, page);
				358	}
				359
Christoph Hellwig	9a286f0	2016-06-21 09:31:39 +1000	[diff] [blame]	360	static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes,
				361	struct iomap *iomap)
				362	{
Christoph Hellwig	57fc505	2018-06-01 09:03:08 -0700	[diff] [blame]	363	return __dax_zero_page_range(iomap->bdev, iomap->dax_dev,
				364	iomap_sector(iomap, pos & PAGE_MASK), offset, bytes);
Christoph Hellwig	9a286f0	2016-06-21 09:31:39 +1000	[diff] [blame]	365	}
				366
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	367	static loff_t
				368	iomap_zero_range_actor(struct inode *inode, loff_t pos, loff_t count,
				369	void *data, struct iomap *iomap)
				370	{
				371	bool *did_zero = data;
				372	loff_t written = 0;
				373	int status;
				374
				375	/* already zeroed? we're done. */
				376	if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
				377	return count;
				378
				379	do {
				380	unsigned offset, bytes;
				381
				382	offset = pos & (PAGE_SIZE - 1); /* Within page */
Christoph Hellwig	e28ae8e	2017-08-11 12:45:35 -0700	[diff] [blame]	383	bytes = min_t(loff_t, PAGE_SIZE - offset, count);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	384
Christoph Hellwig	9a286f0	2016-06-21 09:31:39 +1000	[diff] [blame]	385	if (IS_DAX(inode))
				386	status = iomap_dax_zero(pos, offset, bytes, iomap);
				387	else
				388	status = iomap_zero(inode, pos, offset, bytes, iomap);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	389	if (status < 0)
				390	return status;
				391
				392	pos += bytes;
				393	count -= bytes;
				394	written += bytes;
				395	if (did_zero)
				396	*did_zero = true;
				397	} while (count > 0);
				398
				399	return written;
				400	}
				401
				402	int
				403	iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	404	const struct iomap_ops *ops)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	405	{
				406	loff_t ret;
				407
				408	while (len > 0) {
				409	ret = iomap_apply(inode, pos, len, IOMAP_ZERO,
				410	ops, did_zero, iomap_zero_range_actor);
				411	if (ret <= 0)
				412	return ret;
				413
				414	pos += ret;
				415	len -= ret;
				416	}
				417
				418	return 0;
				419	}
				420	EXPORT_SYMBOL_GPL(iomap_zero_range);
				421
				422	int
				423	iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	424	const struct iomap_ops *ops)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	425	{
Fabian Frederick	9340747	2017-02-27 14:28:32 -0800	[diff] [blame]	426	unsigned int blocksize = i_blocksize(inode);
				427	unsigned int off = pos & (blocksize - 1);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	428
				429	/* Block boundary? Nothing to do */
				430	if (!off)
				431	return 0;
				432	return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops);
				433	}
				434	EXPORT_SYMBOL_GPL(iomap_truncate_page);
				435
				436	static loff_t
				437	iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
				438	void *data, struct iomap *iomap)
				439	{
				440	struct page *page = data;
				441	int ret;
				442
Jan Kara	c663e29	2016-10-24 14:20:25 +1100	[diff] [blame]	443	ret = __block_write_begin_int(page, pos, length, NULL, iomap);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	444	if (ret)
				445	return ret;
				446
				447	block_commit_write(page, 0, length);
				448	return length;
				449	}
				450
Dave Jiang	11bac80	2017-02-24 14:56:41 -0800	[diff] [blame]	451	int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	452	{
				453	struct page *page = vmf->page;
Dave Jiang	11bac80	2017-02-24 14:56:41 -0800	[diff] [blame]	454	struct inode *inode = file_inode(vmf->vma->vm_file);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	455	unsigned long length;
				456	loff_t offset, size;
				457	ssize_t ret;
				458
				459	lock_page(page);
				460	size = i_size_read(inode);
				461	if ((page->mapping != inode->i_mapping) ||
				462	(page_offset(page) > size)) {
				463	/* We overload EFAULT to mean page got truncated */
				464	ret = -EFAULT;
				465	goto out_unlock;
				466	}
				467
				468	/* page is wholly or partially inside EOF */
				469	if (((page->index + 1) << PAGE_SHIFT) > size)
				470	length = size & ~PAGE_MASK;
				471	else
				472	length = PAGE_SIZE;
				473
				474	offset = page_offset(page);
				475	while (length > 0) {
Jan Kara	9484ab1	2016-11-10 10:26:50 +1100	[diff] [blame]	476	ret = iomap_apply(inode, offset, length,
				477	IOMAP_WRITE | IOMAP_FAULT, ops, page,
				478	iomap_page_mkwrite_actor);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	479	if (unlikely(ret <= 0))
				480	goto out_unlock;
				481	offset += ret;
				482	length -= ret;
				483	}
				484
				485	set_page_dirty(page);
				486	wait_for_stable_page(page);
Christoph Hellwig	e7647fb	2017-08-29 10:08:41 -0700	[diff] [blame]	487	return VM_FAULT_LOCKED;
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	488	out_unlock:
				489	unlock_page(page);
Christoph Hellwig	e7647fb	2017-08-29 10:08:41 -0700	[diff] [blame]	490	return block_page_mkwrite_return(ret);
Christoph Hellwig	ae259a9	2016-06-21 09:23:11 +1000	[diff] [blame]	491	}
				492	EXPORT_SYMBOL_GPL(iomap_page_mkwrite);
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	493
				494	struct fiemap_ctx {
				495	struct fiemap_extent_info *fi;
				496	struct iomap prev;
				497	};
				498
				499	static int iomap_to_fiemap(struct fiemap_extent_info *fi,
				500	struct iomap *iomap, u32 flags)
				501	{
				502	switch (iomap->type) {
				503	case IOMAP_HOLE:
				504	/* skip holes */
				505	return 0;
				506	case IOMAP_DELALLOC:
				507	flags |= FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN;
				508	break;
Christoph Hellwig	19319b5	2018-06-01 09:03:06 -0700	[diff] [blame]	509	case IOMAP_MAPPED:
				510	break;
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	511	case IOMAP_UNWRITTEN:
				512	flags |= FIEMAP_EXTENT_UNWRITTEN;
				513	break;
Christoph Hellwig	19319b5	2018-06-01 09:03:06 -0700	[diff] [blame]	514	case IOMAP_INLINE:
				515	flags |= FIEMAP_EXTENT_DATA_INLINE;
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	516	break;
				517	}
				518
Christoph Hellwig	17de0a9	2016-08-29 11:33:58 +1000	[diff] [blame]	519	if (iomap->flags & IOMAP_F_MERGED)
				520	flags |= FIEMAP_EXTENT_MERGED;
Darrick J. Wong	e43c460	2016-09-19 10:13:02 +1000	[diff] [blame]	521	if (iomap->flags & IOMAP_F_SHARED)
				522	flags |= FIEMAP_EXTENT_SHARED;
Christoph Hellwig	17de0a9	2016-08-29 11:33:58 +1000	[diff] [blame]	523
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	524	return fiemap_fill_next_extent(fi, iomap->offset,
Andreas Gruenbacher	19fe5f6	2017-10-01 17:55:54 -0400	[diff] [blame]	525	iomap->addr != IOMAP_NULL_ADDR ? iomap->addr : 0,
Christoph Hellwig	17de0a9	2016-08-29 11:33:58 +1000	[diff] [blame]	526	iomap->length, flags);
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	527	}
				528
				529	static loff_t
				530	iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
				531	struct iomap *iomap)
				532	{
				533	struct fiemap_ctx *ctx = data;
				534	loff_t ret = length;
				535
				536	if (iomap->type == IOMAP_HOLE)
				537	return length;
				538
				539	ret = iomap_to_fiemap(ctx->fi, &ctx->prev, 0);
				540	ctx->prev = *iomap;
				541	switch (ret) {
				542	case 0: /* success */
				543	return length;
				544	case 1: /* extent array full */
				545	return 0;
				546	default:
				547	return ret;
				548	}
				549	}
				550
				551	int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi,
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	552	loff_t start, loff_t len, const struct iomap_ops *ops)
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	553	{
				554	struct fiemap_ctx ctx;
				555	loff_t ret;
				556
				557	memset(&ctx, 0, sizeof(ctx));
				558	ctx.fi = fi;
				559	ctx.prev.type = IOMAP_HOLE;
				560
				561	ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC);
				562	if (ret)
				563	return ret;
				564
Dave Chinner	8896b8f	2016-08-17 08:41:10 +1000	[diff] [blame]	565	if (fi->fi_flags & FIEMAP_FLAG_SYNC) {
				566	ret = filemap_write_and_wait(inode->i_mapping);
				567	if (ret)
				568	return ret;
				569	}
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	570
				571	while (len > 0) {
Christoph Hellwig	d33fd77	2016-10-20 15:51:28 +1100	[diff] [blame]	572	ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx,
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	573	iomap_fiemap_actor);
Dave Chinner	ac2dc05	2016-08-17 08:41:34 +1000	[diff] [blame]	574	/* inode with no (attribute) mapping will give ENOENT */
				575	if (ret == -ENOENT)
				576	break;
Christoph Hellwig	8be9f56	2016-06-21 09:38:45 +1000	[diff] [blame]	577	if (ret < 0)
				578	return ret;
				579	if (ret == 0)
				580	break;
				581
				582	start += ret;
				583	len -= ret;
				584	}
				585
				586	if (ctx.prev.type != IOMAP_HOLE) {
				587	ret = iomap_to_fiemap(fi, &ctx.prev, FIEMAP_EXTENT_LAST);
				588	if (ret < 0)
				589	return ret;
				590	}
				591
				592	return 0;
				593	}
				594	EXPORT_SYMBOL_GPL(iomap_fiemap);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	595
Christoph Hellwig	8a78cb1	2018-06-01 09:04:40 -0700	[diff] [blame]	596	/*
				597	* Seek for SEEK_DATA / SEEK_HOLE within @page, starting at @lastoff.
				598	*
				599	* Returns the offset within the file on success, and -ENOENT otherwise.
				600	*/
				601	static loff_t
				602	page_seek_hole_data(struct page *page, loff_t lastoff, int whence)
				603	{
				604	loff_t offset = page_offset(page);
				605	struct buffer_head *bh, *head;
				606	bool seek_data = whence == SEEK_DATA;
				607
				608	if (lastoff < offset)
				609	lastoff = offset;
				610
				611	bh = head = page_buffers(page);
				612	do {
				613	offset += bh->b_size;
				614	if (lastoff >= offset)
				615	continue;
				616
				617	/*
Christoph Hellwig	bd56b3e	2018-06-01 09:05:14 -0700	[diff] [blame^]	618	* Any buffer with valid data in it should have BH_Uptodate set.
Christoph Hellwig	8a78cb1	2018-06-01 09:04:40 -0700	[diff] [blame]	619	*/
Christoph Hellwig	bd56b3e	2018-06-01 09:05:14 -0700	[diff] [blame^]	620	if (buffer_uptodate(bh) == seek_data)
Christoph Hellwig	8a78cb1	2018-06-01 09:04:40 -0700	[diff] [blame]	621	return lastoff;
				622
				623	lastoff = offset;
				624	} while ((bh = bh->b_this_page) != head);
				625	return -ENOENT;
				626	}
				627
				628	/*
				629	* Seek for SEEK_DATA / SEEK_HOLE in the page cache.
				630	*
				631	* Within unwritten extents, the page cache determines which parts are holes
Christoph Hellwig	bd56b3e	2018-06-01 09:05:14 -0700	[diff] [blame^]	632	* and which are data: uptodate buffer heads count as data; everything else
				633	* counts as a hole.
Christoph Hellwig	8a78cb1	2018-06-01 09:04:40 -0700	[diff] [blame]	634	*
				635	* Returns the resulting offset on success, and -ENOENT otherwise.
				636	*/
				637	static loff_t
				638	page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length,
				639	int whence)
				640	{
				641	pgoff_t index = offset >> PAGE_SHIFT;
				642	pgoff_t end = DIV_ROUND_UP(offset + length, PAGE_SIZE);
				643	loff_t lastoff = offset;
				644	struct pagevec pvec;
				645
				646	if (length <= 0)
				647	return -ENOENT;
				648
				649	pagevec_init(&pvec);
				650
				651	do {
				652	unsigned nr_pages, i;
				653
				654	nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &index,
				655	end - 1);
				656	if (nr_pages == 0)
				657	break;
				658
				659	for (i = 0; i < nr_pages; i++) {
				660	struct page *page = pvec.pages[i];
				661
				662	/*
				663	* At this point, the page may be truncated or
				664	* invalidated (changing page->mapping to NULL), or
				665	* even swizzled back from swapper_space to tmpfs file
				666	* mapping. However, page->index will not change
				667	* because we have a reference on the page.
				668	*
				669	* If current page offset is beyond where we've ended,
				670	* we've found a hole.
				671	*/
				672	if (whence == SEEK_HOLE &&
				673	lastoff < page_offset(page))
				674	goto check_range;
				675
				676	lock_page(page);
				677	if (likely(page->mapping == inode->i_mapping) &&
				678	page_has_buffers(page)) {
				679	lastoff = page_seek_hole_data(page, lastoff, whence);
				680	if (lastoff >= 0) {
				681	unlock_page(page);
				682	goto check_range;
				683	}
				684	}
				685	unlock_page(page);
				686	lastoff = page_offset(page) + PAGE_SIZE;
				687	}
				688	pagevec_release(&pvec);
				689	} while (index < end);
				690
				691	/* When no page at lastoff and we are not done, we found a hole. */
				692	if (whence != SEEK_HOLE)
				693	goto not_found;
				694
				695	check_range:
				696	if (lastoff < offset + length)
				697	goto out;
				698	not_found:
				699	lastoff = -ENOENT;
				700	out:
				701	pagevec_release(&pvec);
				702	return lastoff;
				703	}
				704
				705
Andreas Gruenbacher	0ed3b0d	2017-06-29 11:43:21 -0700	[diff] [blame]	706	static loff_t
				707	iomap_seek_hole_actor(struct inode *inode, loff_t offset, loff_t length,
				708	void *data, struct iomap *iomap)
				709	{
				710	switch (iomap->type) {
				711	case IOMAP_UNWRITTEN:
				712	offset = page_cache_seek_hole_data(inode, offset, length,
				713	SEEK_HOLE);
				714	if (offset < 0)
				715	return length;
				716	/* fall through */
				717	case IOMAP_HOLE:
				718	*(loff_t *)data = offset;
				719	return 0;
				720	default:
				721	return length;
				722	}
				723	}
				724
				725	loff_t
				726	iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
				727	{
				728	loff_t size = i_size_read(inode);
				729	loff_t length = size - offset;
				730	loff_t ret;
				731
Darrick J. Wong	d6ab17f	2017-07-12 10:26:47 -0700	[diff] [blame]	732	/* Nothing to be found before or beyond the end of the file. */
				733	if (offset < 0 \|\| offset >= size)
Andreas Gruenbacher	0ed3b0d	2017-06-29 11:43:21 -0700	[diff] [blame]	734	return -ENXIO;
				735
				736	while (length > 0) {
				737	ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				738	&offset, iomap_seek_hole_actor);
				739	if (ret < 0)
				740	return ret;
				741	if (ret == 0)
				742	break;
				743
				744	offset += ret;
				745	length -= ret;
				746	}
				747
				748	return offset;
				749	}
				750	EXPORT_SYMBOL_GPL(iomap_seek_hole);
				751
				752	static loff_t
				753	iomap_seek_data_actor(struct inode *inode, loff_t offset, loff_t length,
				754	void *data, struct iomap *iomap)
				755	{
				756	switch (iomap->type) {
				757	case IOMAP_HOLE:
				758	return length;
				759	case IOMAP_UNWRITTEN:
				760	offset = page_cache_seek_hole_data(inode, offset, length,
				761	SEEK_DATA);
				762	if (offset < 0)
				763	return length;
				764	/*FALLTHRU*/
				765	default:
				766	*(loff_t *)data = offset;
				767	return 0;
				768	}
				769	}
				770
				771	loff_t
				772	iomap_seek_data(struct inode *inode, loff_t offset, const struct iomap_ops *ops)
				773	{
				774	loff_t size = i_size_read(inode);
				775	loff_t length = size - offset;
				776	loff_t ret;
				777
Darrick J. Wong	d6ab17f	2017-07-12 10:26:47 -0700	[diff] [blame]	778	/* Nothing to be found before or beyond the end of the file. */
				779	if (offset < 0 || offset >= size)
Andreas Gruenbacher	0ed3b0d	2017-06-29 11:43:21 -0700	[diff] [blame]	780	return -ENXIO;
				781
				782	while (length > 0) {
				783	ret = iomap_apply(inode, offset, length, IOMAP_REPORT, ops,
				784	&offset, iomap_seek_data_actor);
				785	if (ret < 0)
				786	return ret;
				787	if (ret == 0)
				788	break;
				789
				790	offset += ret;
				791	length -= ret;
				792	}
				793
				794	if (length <= 0)
				795	return -ENXIO;
				796	return offset;
				797	}
				798	EXPORT_SYMBOL_GPL(iomap_seek_data);
				799
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	800	/*
				801	* Private flags for iomap_dio, must not overlap with the public ones in
				802	* iomap.h:
				803	*/
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	804	#define IOMAP_DIO_WRITE_FUA (1 << 28)
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	805	#define IOMAP_DIO_NEED_SYNC (1 << 29)
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	806	#define IOMAP_DIO_WRITE (1 << 30)
				807	#define IOMAP_DIO_DIRTY (1 << 31)
				808
				809	struct iomap_dio {
				810	struct kiocb *iocb;
				811	iomap_dio_end_io_t *end_io;
				812	loff_t i_size;
				813	loff_t size;
				814	atomic_t ref;
				815	unsigned flags;
				816	int error;
				817
				818	union {
				819	/* used during submission and for synchronous completion: */
				820	struct {
				821	struct iov_iter *iter;
				822	struct task_struct *waiter;
				823	struct request_queue *last_queue;
				824	blk_qc_t cookie;
				825	} submit;
				826
				827	/* used for aio completion: */
				828	struct {
				829	struct work_struct work;
				830	} aio;
				831	};
				832	};
				833
				834	static ssize_t iomap_dio_complete(struct iomap_dio *dio)
				835	{
				836	struct kiocb *iocb = dio->iocb;
Lukas Czerner	332391a	2017-09-21 08:16:29 -0600	[diff] [blame]	837	struct inode *inode = file_inode(iocb->ki_filp);
Eryu Guan	5e25c26	2017-10-13 09:47:46 -0700	[diff] [blame]	838	loff_t offset = iocb->ki_pos;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	839	ssize_t ret;
				840
				841	if (dio->end_io) {
				842	ret = dio->end_io(iocb,
				843	dio->error ? dio->error : dio->size,
				844	dio->flags);
				845	} else {
				846	ret = dio->error;
				847	}
				848
				849	if (likely(!ret)) {
				850	ret = dio->size;
				851	/* check for short read */
Eryu Guan	5e25c26	2017-10-13 09:47:46 -0700	[diff] [blame]	852	if (offset + ret > dio->i_size &&
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	853	!(dio->flags & IOMAP_DIO_WRITE))
Eryu Guan	5e25c26	2017-10-13 09:47:46 -0700	[diff] [blame]	854	ret = dio->i_size - offset;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	855	iocb->ki_pos += ret;
				856	}
				857
Eryu Guan	5e25c26	2017-10-13 09:47:46 -0700	[diff] [blame]	858	/*
				859	* Try again to invalidate clean pages which might have been cached by
				860	* non-direct readahead, or faulted in by get_user_pages() if the source
				861	* of the write was an mmap'ed region of the file we're writing. Either
				862	* one is a pretty crazy thing to do, so we don't support it 100%. If
				863	* this invalidation fails, tough, the write still worked...
				864	*
				865	* And this page cache invalidation has to be after dio->end_io(), as
				866	* some filesystems convert unwritten extents to real allocations in
				867	* end_io() when necessary, otherwise a racing buffer read would cache
				868	* zeros from unwritten extents.
				869	*/
				870	if (!dio->error &&
				871	(dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
				872	int err;
				873	err = invalidate_inode_pages2_range(inode->i_mapping,
				874	offset >> PAGE_SHIFT,
				875	(offset + dio->size - 1) >> PAGE_SHIFT);
Darrick J. Wong	5a9d929	2018-01-08 10:41:39 -0800	[diff] [blame]	876	if (err)
				877	dio_warn_stale_pagecache(iocb->ki_filp);
Eryu Guan	5e25c26	2017-10-13 09:47:46 -0700	[diff] [blame]	878	}
				879
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	880	/*
				881	* If this is a DSYNC write, make sure we push it to stable storage now
				882	* that we've written data.
				883	*/
				884	if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
				885	ret = generic_write_sync(iocb, ret);
				886
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	887	inode_dio_end(file_inode(iocb->ki_filp));
				888	kfree(dio);
				889
				890	return ret;
				891	}
				892
				893	static void iomap_dio_complete_work(struct work_struct *work)
				894	{
				895	struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work);
				896	struct kiocb *iocb = dio->iocb;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	897
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	898	iocb->ki_complete(iocb, iomap_dio_complete(dio), 0);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	899	}
				900
				901	/*
				902	* Set an error in the dio if none is set yet. We have to use cmpxchg
				903	* as the submission context and the completion context(s) can race to
				904	* update the error.
				905	*/
				906	static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
				907	{
				908	cmpxchg(&dio->error, 0, ret);
				909	}
				910
				911	static void iomap_dio_bio_end_io(struct bio *bio)
				912	{
				913	struct iomap_dio *dio = bio->bi_private;
				914	bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
				915
Christoph Hellwig	4e4cbee	2017-06-03 09:38:06 +0200	[diff] [blame]	916	if (bio->bi_status)
				917	iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status));
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	918
				919	if (atomic_dec_and_test(&dio->ref)) {
				920	if (is_sync_kiocb(dio->iocb)) {
				921	struct task_struct *waiter = dio->submit.waiter;
				922
				923	WRITE_ONCE(dio->submit.waiter, NULL);
				924	wake_up_process(waiter);
				925	} else if (dio->flags & IOMAP_DIO_WRITE) {
				926	struct inode *inode = file_inode(dio->iocb->ki_filp);
				927
				928	INIT_WORK(&dio->aio.work, iomap_dio_complete_work);
				929	queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work);
				930	} else {
				931	iomap_dio_complete_work(&dio->aio.work);
				932	}
				933	}
				934
				935	if (should_dirty) {
				936	bio_check_pages_dirty(bio);
				937	} else {
				938	struct bio_vec *bvec;
				939	int i;
				940
				941	bio_for_each_segment_all(bvec, bio, i)
				942	put_page(bvec->bv_page);
				943	bio_put(bio);
				944	}
				945	}
				946
				947	static blk_qc_t
				948	iomap_dio_zero(struct iomap_dio dio, struct iomap iomap, loff_t pos,
				949	unsigned len)
				950	{
				951	struct page *page = ZERO_PAGE(0);
				952	struct bio *bio;
				953
				954	bio = bio_alloc(GFP_KERNEL, 1);
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	955	bio_set_dev(bio, iomap->bdev);
Christoph Hellwig	57fc505	2018-06-01 09:03:08 -0700	[diff] [blame]	956	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	957	bio->bi_private = dio;
				958	bio->bi_end_io = iomap_dio_bio_end_io;
				959
				960	get_page(page);
Christoph Hellwig	6533b4e	2018-06-01 09:03:07 -0700	[diff] [blame]	961	__bio_add_page(bio, page, len, 0);
Linus Torvalds	5cc60ae	2016-12-14 21:35:31 -0800	[diff] [blame]	962	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC \| REQ_IDLE);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	963
				964	atomic_inc(&dio->ref);
				965	return submit_bio(bio);
				966	}
				967
				968	static loff_t
				969	iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length,
				970	void data, struct iomap iomap)
				971	{
				972	struct iomap_dio *dio = data;
Fabian Frederick	9340747	2017-02-27 14:28:32 -0800	[diff] [blame]	973	unsigned int blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev));
				974	unsigned int fs_block_size = i_blocksize(inode), pad;
				975	unsigned int align = iov_iter_alignment(dio->submit.iter);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	976	struct iov_iter iter;
				977	struct bio *bio;
				978	bool need_zeroout = false;
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	979	bool use_fua = false;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	980	int nr_pages, ret;
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	981	size_t copied = 0;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	982
				983	if ((pos \| length \| align) & ((1 << blkbits) - 1))
				984	return -EINVAL;
				985
				986	switch (iomap->type) {
				987	case IOMAP_HOLE:
				988	if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE))
				989	return -EIO;
				990	/FALLTHRU/
				991	case IOMAP_UNWRITTEN:
				992	if (!(dio->flags & IOMAP_DIO_WRITE)) {
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	993	length = iov_iter_zero(length, dio->submit.iter);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	994	dio->size += length;
				995	return length;
				996	}
				997	dio->flags \|= IOMAP_DIO_UNWRITTEN;
				998	need_zeroout = true;
				999	break;
				1000	case IOMAP_MAPPED:
				1001	if (iomap->flags & IOMAP_F_SHARED)
				1002	dio->flags \|= IOMAP_DIO_COW;
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1003	if (iomap->flags & IOMAP_F_NEW) {
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1004	need_zeroout = true;
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1005	} else {
				1006	/*
				1007	* Use a FUA write if we need datasync semantics, this
				1008	* is a pure data IO that doesn't require any metadata
				1009	* updates and the underlying device supports FUA. This
				1010	* allows us to avoid cache flushes on IO completion.
				1011	*/
				1012	if (!(iomap->flags & (IOMAP_F_SHARED\|IOMAP_F_DIRTY)) &&
				1013	(dio->flags & IOMAP_DIO_WRITE_FUA) &&
				1014	blk_queue_fua(bdev_get_queue(iomap->bdev)))
				1015	use_fua = true;
				1016	}
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1017	break;
				1018	default:
				1019	WARN_ON_ONCE(1);
				1020	return -EIO;
				1021	}
				1022
				1023	/*
				1024	* Operate on a partial iter trimmed to the extent we were called for.
				1025	* We'll update the iter in the dio once we're done with this extent.
				1026	*/
				1027	iter = *dio->submit.iter;
				1028	iov_iter_truncate(&iter, length);
				1029
				1030	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
				1031	if (nr_pages <= 0)
				1032	return nr_pages;
				1033
				1034	if (need_zeroout) {
				1035	/* zero out from the start of the block to the write offset */
				1036	pad = pos & (fs_block_size - 1);
				1037	if (pad)
				1038	iomap_dio_zero(dio, iomap, pos - pad, pad);
				1039	}
				1040
				1041	do {
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1042	size_t n;
				1043	if (dio->error) {
				1044	iov_iter_revert(dio->submit.iter, copied);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1045	return 0;
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1046	}
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1047
				1048	bio = bio_alloc(GFP_KERNEL, nr_pages);
Christoph Hellwig	74d4699	2017-08-23 19:10:32 +0200	[diff] [blame]	1049	bio_set_dev(bio, iomap->bdev);
Christoph Hellwig	57fc505	2018-06-01 09:03:08 -0700	[diff] [blame]	1050	bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
Jens Axboe	45d06cf	2017-06-27 11:01:22 -0600	[diff] [blame]	1051	bio->bi_write_hint = dio->iocb->ki_hint;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1052	bio->bi_private = dio;
				1053	bio->bi_end_io = iomap_dio_bio_end_io;
				1054
				1055	ret = bio_iov_iter_get_pages(bio, &iter);
				1056	if (unlikely(ret)) {
				1057	bio_put(bio);
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1058	return copied ? copied : ret;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1059	}
				1060
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1061	n = bio->bi_iter.bi_size;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1062	if (dio->flags & IOMAP_DIO_WRITE) {
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1063	bio->bi_opf = REQ_OP_WRITE \| REQ_SYNC \| REQ_IDLE;
				1064	if (use_fua)
				1065	bio->bi_opf \|= REQ_FUA;
				1066	else
				1067	dio->flags &= ~IOMAP_DIO_WRITE_FUA;
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1068	task_io_account_write(n);
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1069	} else {
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1070	bio->bi_opf = REQ_OP_READ;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1071	if (dio->flags & IOMAP_DIO_DIRTY)
				1072	bio_set_pages_dirty(bio);
				1073	}
				1074
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1075	iov_iter_advance(dio->submit.iter, n);
				1076
				1077	dio->size += n;
				1078	pos += n;
				1079	copied += n;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1080
				1081	nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES);
				1082
				1083	atomic_inc(&dio->ref);
				1084
				1085	dio->submit.last_queue = bdev_get_queue(iomap->bdev);
				1086	dio->submit.cookie = submit_bio(bio);
				1087	} while (nr_pages);
				1088
				1089	if (need_zeroout) {
				1090	/* zero out from the end of the write to the end of the block */
				1091	pad = pos & (fs_block_size - 1);
				1092	if (pad)
				1093	iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
				1094	}
Al Viro	cfe057f	2017-09-11 21:17:09 +0100	[diff] [blame]	1095	return copied;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1096	}
				1097
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	1098	/*
				1099	* iomap_dio_rw() always completes O_[D]SYNC writes regardless of whether the IO
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1100	* is being issued as AIO or not. This allows us to optimise pure data writes
				1101	* to use REQ_FUA rather than requiring generic_write_sync() to issue a
				1102	* REQ_FLUSH post write. This is slightly tricky because a single request here
				1103	* can be mapped into multiple disjoint IOs and only a subset of the IOs issued
				1104	* may be pure data writes. In that case, we still need to do a full data sync
				1105	* completion.
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	1106	*/
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1107	ssize_t
Christoph Hellwig	8ff6daa	2017-01-27 23:20:26 -0800	[diff] [blame]	1108	iomap_dio_rw(struct kiocb iocb, struct iov_iter iter,
				1109	const struct iomap_ops *ops, iomap_dio_end_io_t end_io)
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1110	{
				1111	struct address_space *mapping = iocb->ki_filp->f_mapping;
				1112	struct inode *inode = file_inode(iocb->ki_filp);
				1113	size_t count = iov_iter_count(iter);
Eryu Guan	c771c14	2017-03-02 15:02:06 -0800	[diff] [blame]	1114	loff_t pos = iocb->ki_pos, start = pos;
				1115	loff_t end = iocb->ki_pos + count - 1, ret = 0;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1116	unsigned int flags = IOMAP_DIRECT;
				1117	struct blk_plug plug;
				1118	struct iomap_dio *dio;
				1119
				1120	lockdep_assert_held(&inode->i_rwsem);
				1121
				1122	if (!count)
				1123	return 0;
				1124
				1125	dio = kmalloc(sizeof(*dio), GFP_KERNEL);
				1126	if (!dio)
				1127	return -ENOMEM;
				1128
				1129	dio->iocb = iocb;
				1130	atomic_set(&dio->ref, 1);
				1131	dio->size = 0;
				1132	dio->i_size = i_size_read(inode);
				1133	dio->end_io = end_io;
				1134	dio->error = 0;
				1135	dio->flags = 0;
				1136
				1137	dio->submit.iter = iter;
				1138	if (is_sync_kiocb(iocb)) {
				1139	dio->submit.waiter = current;
				1140	dio->submit.cookie = BLK_QC_T_NONE;
				1141	dio->submit.last_queue = NULL;
				1142	}
				1143
				1144	if (iov_iter_rw(iter) == READ) {
				1145	if (pos >= dio->i_size)
				1146	goto out_free_dio;
				1147
				1148	if (iter->type == ITER_IOVEC)
				1149	dio->flags \|= IOMAP_DIO_DIRTY;
				1150	} else {
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1151	flags \|= IOMAP_WRITE;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1152	dio->flags \|= IOMAP_DIO_WRITE;
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1153
				1154	/* for data sync or sync, we need sync completion processing */
Dave Chinner	4f8ff44	2018-05-02 12:54:52 -0700	[diff] [blame]	1155	if (iocb->ki_flags & IOCB_DSYNC)
				1156	dio->flags \|= IOMAP_DIO_NEED_SYNC;
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1157
				1158	/*
				1159	* For datasync only writes, we optimistically try using FUA for
				1160	* this IO. Any non-FUA write that occurs will clear this flag,
				1161	* hence we know before completion whether a cache flush is
				1162	* necessary.
				1163	*/
				1164	if ((iocb->ki_flags & (IOCB_DSYNC \| IOCB_SYNC)) == IOCB_DSYNC)
				1165	dio->flags \|= IOMAP_DIO_WRITE_FUA;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1166	}
				1167
Goldwyn Rodrigues	a38d124	2017-06-20 07:05:45 -0500	[diff] [blame]	1168	if (iocb->ki_flags & IOCB_NOWAIT) {
				1169	if (filemap_range_has_page(mapping, start, end)) {
				1170	ret = -EAGAIN;
				1171	goto out_free_dio;
				1172	}
				1173	flags \|= IOMAP_NOWAIT;
				1174	}
				1175
Andrey Ryabinin	55635ba	2017-05-03 14:55:59 -0700	[diff] [blame]	1176	ret = filemap_write_and_wait_range(mapping, start, end);
				1177	if (ret)
				1178	goto out_free_dio;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1179
Darrick J. Wong	5a9d929	2018-01-08 10:41:39 -0800	[diff] [blame]	1180	/*
				1181	* Try to invalidate cache pages for the range we're direct
				1182	* writing. If this invalidation fails, tough, the write will
				1183	* still work, but racing two incompatible write paths is a
				1184	* pretty crazy thing to do, so we don't support it 100%.
				1185	*/
Andrey Ryabinin	55635ba	2017-05-03 14:55:59 -0700	[diff] [blame]	1186	ret = invalidate_inode_pages2_range(mapping,
				1187	start >> PAGE_SHIFT, end >> PAGE_SHIFT);
Darrick J. Wong	5a9d929	2018-01-08 10:41:39 -0800	[diff] [blame]	1188	if (ret)
				1189	dio_warn_stale_pagecache(iocb->ki_filp);
Andrey Ryabinin	55635ba	2017-05-03 14:55:59 -0700	[diff] [blame]	1190	ret = 0;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1191
Chandan Rajendra	546e7be	2017-09-22 11:47:33 -0700	[diff] [blame]	1192	if (iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) &&
				1193	!inode->i_sb->s_dio_done_wq) {
				1194	ret = sb_init_dio_done_wq(inode->i_sb);
				1195	if (ret < 0)
				1196	goto out_free_dio;
				1197	}
				1198
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1199	inode_dio_begin(inode);
				1200
				1201	blk_start_plug(&plug);
				1202	do {
				1203	ret = iomap_apply(inode, pos, count, flags, ops, dio,
				1204	iomap_dio_actor);
				1205	if (ret <= 0) {
				1206	/* magic error code to fall back to buffered I/O */
				1207	if (ret == -ENOTBLK)
				1208	ret = 0;
				1209	break;
				1210	}
				1211	pos += ret;
Chandan Rajendra	a008c31	2017-04-12 11:03:20 -0700	[diff] [blame]	1212
				1213	if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
				1214	break;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1215	} while ((count = iov_iter_count(iter)) > 0);
				1216	blk_finish_plug(&plug);
				1217
				1218	if (ret < 0)
				1219	iomap_dio_set_error(dio, ret);
				1220
Dave Chinner	3460cac	2018-05-02 12:54:53 -0700	[diff] [blame]	1221	/*
				1222	* If all the writes we issued were FUA, we don't need to flush the
				1223	* cache on IO completion. Clear the sync flag for this case.
				1224	*/
				1225	if (dio->flags & IOMAP_DIO_WRITE_FUA)
				1226	dio->flags &= ~IOMAP_DIO_NEED_SYNC;
				1227
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1228	if (!atomic_dec_and_test(&dio->ref)) {
				1229	if (!is_sync_kiocb(iocb))
				1230	return -EIOCBQUEUED;
				1231
				1232	for (;;) {
				1233	set_current_state(TASK_UNINTERRUPTIBLE);
				1234	if (!READ_ONCE(dio->submit.waiter))
				1235	break;
				1236
				1237	if (!(iocb->ki_flags & IOCB_HIPRI) \|\|
				1238	!dio->submit.last_queue \|\|
Christoph Hellwig	ea435e1	2017-11-02 21:29:54 +0300	[diff] [blame]	1239	!blk_poll(dio->submit.last_queue,
Linus Torvalds	5cc60ae	2016-12-14 21:35:31 -0800	[diff] [blame]	1240	dio->submit.cookie))
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1241	io_schedule();
				1242	}
				1243	__set_current_state(TASK_RUNNING);
				1244	}
				1245
Eryu Guan	c771c14	2017-03-02 15:02:06 -0800	[diff] [blame]	1246	ret = iomap_dio_complete(dio);
				1247
Eryu Guan	c771c14	2017-03-02 15:02:06 -0800	[diff] [blame]	1248	return ret;
Christoph Hellwig	ff6a929	2016-11-30 14:36:01 +1100	[diff] [blame]	1249
				1250	out_free_dio:
				1251	kfree(dio);
				1252	return ret;
				1253	}
				1254	EXPORT_SYMBOL_GPL(iomap_dio_rw);
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1255
				1256	/* Swapfile activation */
				1257
				1258	#ifdef CONFIG_SWAP
				1259	struct iomap_swapfile_info {
				1260	struct iomap iomap; /* accumulated iomap */
				1261	struct swap_info_struct *sis;
				1262	uint64_t lowest_ppage; /* lowest physical addr seen (pages) */
				1263	uint64_t highest_ppage; /* highest physical addr seen (pages) */
				1264	unsigned long nr_pages; /* number of pages collected */
				1265	int nr_extents; /* extent count */
				1266	};
				1267
				1268	/*
				1269	* Collect physical extents for this swap file. Physical extents reported to
				1270	* the swap code must be trimmed to align to a page boundary. The logical
				1271	* offset within the file is irrelevant since the swapfile code maps logical
				1272	* page numbers of the swap device to the physical page-aligned extents.
				1273	*/
				1274	static int iomap_swapfile_add_extent(struct iomap_swapfile_info *isi)
				1275	{
				1276	struct iomap *iomap = &isi->iomap;
				1277	unsigned long nr_pages;
				1278	uint64_t first_ppage;
				1279	uint64_t first_ppage_reported;
				1280	uint64_t next_ppage;
				1281	int error;
				1282
				1283	/*
				1284	* Round the start up and the end down so that the physical
				1285	* extent aligns to a page boundary.
				1286	*/
				1287	first_ppage = ALIGN(iomap->addr, PAGE_SIZE) >> PAGE_SHIFT;
				1288	next_ppage = ALIGN_DOWN(iomap->addr + iomap->length, PAGE_SIZE) >>
				1289	PAGE_SHIFT;
				1290
				1291	/* Skip too-short physical extents. */
				1292	if (first_ppage >= next_ppage)
				1293	return 0;
				1294	nr_pages = next_ppage - first_ppage;
				1295
				1296	/*
				1297	* Calculate how much swap space we're adding; the first page contains
				1298	* the swap header and doesn't count. The mm still wants that first
				1299	* page fed to add_swap_extent, however.
				1300	*/
				1301	first_ppage_reported = first_ppage;
				1302	if (iomap->offset == 0)
				1303	first_ppage_reported++;
				1304	if (isi->lowest_ppage > first_ppage_reported)
				1305	isi->lowest_ppage = first_ppage_reported;
				1306	if (isi->highest_ppage < (next_ppage - 1))
				1307	isi->highest_ppage = next_ppage - 1;
				1308
				1309	/* Add extent, set up for the next call. */
				1310	error = add_swap_extent(isi->sis, isi->nr_pages, nr_pages, first_ppage);
				1311	if (error < 0)
				1312	return error;
				1313	isi->nr_extents += error;
				1314	isi->nr_pages += nr_pages;
				1315	return 0;
				1316	}
				1317
				1318	/*
				1319	* Accumulate iomaps for this swap file. We have to accumulate iomaps because
				1320	* swap only cares about contiguous page-aligned physical extents and makes no
				1321	* distinction between written and unwritten extents.
				1322	*/
				1323	static loff_t iomap_swapfile_activate_actor(struct inode *inode, loff_t pos,
				1324	loff_t count, void data, struct iomap iomap)
				1325	{
				1326	struct iomap_swapfile_info *isi = data;
				1327	int error;
				1328
Christoph Hellwig	19319b5	2018-06-01 09:03:06 -0700	[diff] [blame]	1329	switch (iomap->type) {
				1330	case IOMAP_MAPPED:
				1331	case IOMAP_UNWRITTEN:
				1332	/* Only real or unwritten extents. */
				1333	break;
				1334	case IOMAP_INLINE:
				1335	/* No inline data. */
Omar Sandoval	ec60192	2018-05-16 11:13:34 -0700	[diff] [blame]	1336	pr_err("swapon: file is inline\n");
				1337	return -EINVAL;
Christoph Hellwig	19319b5	2018-06-01 09:03:06 -0700	[diff] [blame]	1338	default:
Omar Sandoval	ec60192	2018-05-16 11:13:34 -0700	[diff] [blame]	1339	pr_err("swapon: file has unallocated extents\n");
				1340	return -EINVAL;
				1341	}
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1342
Omar Sandoval	ec60192	2018-05-16 11:13:34 -0700	[diff] [blame]	1343	/* No uncommitted metadata or shared blocks. */
				1344	if (iomap->flags & IOMAP_F_DIRTY) {
				1345	pr_err("swapon: file is not committed\n");
				1346	return -EINVAL;
				1347	}
				1348	if (iomap->flags & IOMAP_F_SHARED) {
				1349	pr_err("swapon: file has shared extents\n");
				1350	return -EINVAL;
				1351	}
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1352
Omar Sandoval	ec60192	2018-05-16 11:13:34 -0700	[diff] [blame]	1353	/* Only one bdev per swap file. */
				1354	if (iomap->bdev != isi->sis->bdev) {
				1355	pr_err("swapon: file is on multiple devices\n");
				1356	return -EINVAL;
				1357	}
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1358
				1359	if (isi->iomap.length == 0) {
				1360	/* No accumulated extent, so just store it. */
				1361	memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
				1362	} else if (isi->iomap.addr + isi->iomap.length == iomap->addr) {
				1363	/* Append this to the accumulated extent. */
				1364	isi->iomap.length += iomap->length;
				1365	} else {
				1366	/* Otherwise, add the retained iomap and store this one. */
				1367	error = iomap_swapfile_add_extent(isi);
				1368	if (error)
				1369	return error;
				1370	memcpy(&isi->iomap, iomap, sizeof(isi->iomap));
				1371	}
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1372	return count;
Darrick J. Wong	6748212	2018-05-10 08:38:15 -0700	[diff] [blame]	1373	}
				1374
				1375	/*
				1376	* Iterate a swap file's iomaps to construct physical extents that can be
				1377	* passed to the swapfile subsystem.
				1378	*/
				1379	int iomap_swapfile_activate(struct swap_info_struct *sis,
				1380	struct file swap_file, sector_t pagespan,
				1381	const struct iomap_ops *ops)
				1382	{
				1383	struct iomap_swapfile_info isi = {
				1384	.sis = sis,
				1385	.lowest_ppage = (sector_t)-1ULL,
				1386	};
				1387	struct address_space *mapping = swap_file->f_mapping;
				1388	struct inode *inode = mapping->host;
				1389	loff_t pos = 0;
				1390	loff_t len = ALIGN_DOWN(i_size_read(inode), PAGE_SIZE);
				1391	loff_t ret;
				1392
				1393	ret = filemap_write_and_wait(inode->i_mapping);
				1394	if (ret)
				1395	return ret;
				1396
				1397	while (len > 0) {
				1398	ret = iomap_apply(inode, pos, len, IOMAP_REPORT,
				1399	ops, &isi, iomap_swapfile_activate_actor);
				1400	if (ret <= 0)
				1401	return ret;
				1402
				1403	pos += ret;
				1404	len -= ret;
				1405	}
				1406
				1407	if (isi.iomap.length) {
				1408	ret = iomap_swapfile_add_extent(&isi);
				1409	if (ret)
				1410	return ret;
				1411	}
				1412
				1413	*pagespan = 1 + isi.highest_ppage - isi.lowest_ppage;
				1414	sis->max = isi.nr_pages;
				1415	sis->pages = isi.nr_pages - 1;
				1416	sis->highest_bit = isi.nr_pages - 1;
				1417	return isi.nr_extents;
				1418	}
				1419	EXPORT_SYMBOL_GPL(iomap_swapfile_activate);
				1420	#endif /* CONFIG_SWAP */
Christoph Hellwig	89eb190	2018-06-01 09:03:08 -0700	[diff] [blame]	1421
				1422	static loff_t
				1423	iomap_bmap_actor(struct inode *inode, loff_t pos, loff_t length,
				1424	void data, struct iomap iomap)
				1425	{
				1426	sector_t *bno = data, addr;
				1427
				1428	if (iomap->type == IOMAP_MAPPED) {
				1429	addr = (pos - iomap->offset + iomap->addr) >> inode->i_blkbits;
				1430	if (addr > INT_MAX)
				1431	WARN(1, "would truncate bmap result\n");
				1432	else
				1433	*bno = addr;
				1434	}
				1435	return 0;
				1436	}
				1437
				1438	/* legacy ->bmap interface. 0 is the error return (!) */
				1439	sector_t
				1440	iomap_bmap(struct address_space *mapping, sector_t bno,
				1441	const struct iomap_ops *ops)
				1442	{
				1443	struct inode *inode = mapping->host;
				1444	loff_t pos = bno >> inode->i_blkbits;
				1445	unsigned blocksize = i_blocksize(inode);
				1446
				1447	if (filemap_write_and_wait(mapping))
				1448	return 0;
				1449
				1450	bno = 0;
				1451	iomap_apply(inode, pos, blocksize, 0, ops, &bno, iomap_bmap_actor);
				1452	return bno;
				1453	}
				1454	EXPORT_SYMBOL_GPL(iomap_bmap);