Blame - fs/btrfs/reflink.c - SHIFTPHONES/mainline/linux

blob: 829adbb508372440ea2b665694eab0032ec7f661 [file] [log] [blame]

Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
				3	#include <linux/iversion.h>
				4	#include "ctree.h"
				5	#include "reflink.h"
				6	#include "transaction.h"
				7
				8	#define BTRFS_MAX_DEDUPE_LEN SZ_16M
				9
				10	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				11	struct inode *inode,
				12	u64 endoff,
				13	const u64 destoff,
				14	const u64 olen,
				15	int no_time_update)
				16	{
				17	struct btrfs_root *root = BTRFS_I(inode)->root;
				18	int ret;
				19
				20	inode_inc_iversion(inode);
				21	if (!no_time_update)
				22	inode->i_mtime = inode->i_ctime = current_time(inode);
				23	/*
				24	* We round up to the block size at eof when determining which
				25	* extents to clone above, but shouldn't round up the file size.
				26	*/
				27	if (endoff > destoff + olen)
				28	endoff = destoff + olen;
				29	if (endoff > inode->i_size) {
				30	i_size_write(inode, endoff);
				31	btrfs_inode_safe_disk_i_size_write(inode, 0);
				32	}
				33
				34	ret = btrfs_update_inode(trans, root, inode);
				35	if (ret) {
				36	btrfs_abort_transaction(trans, ret);
				37	btrfs_end_transaction(trans);
				38	goto out;
				39	}
				40	ret = btrfs_end_transaction(trans);
				41	out:
				42	return ret;
				43	}
				44
				45	/*
				46	* Make sure we do not end up inserting an inline extent into a file that has
				47	* already other (non-inline) extents. If a file has an inline extent it can
				48	* not have any other extents and the (single) inline extent must start at the
				49	* file offset 0. Failing to respect these rules will lead to file corruption,
				50	* resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
				51	*
				52	* We can have extents that have been already written to disk or we can have
				53	* dirty ranges still in delalloc, in which case the extent maps and items are
				54	* created only when we run delalloc, and the delalloc ranges might fall outside
				55	* the range we are currently locking in the inode's io tree. So we check the
				56	* inode's i_size because of that (i_size updates are done while holding the
				57	* i_mutex, which we are holding here).
				58	* We also check to see if the inode has a size not greater than "datal" but has
				59	* extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
				60	* protected against such concurrent fallocate calls by the i_mutex).
				61	*
				62	* If the file has no extents but a size greater than datal, do not allow the
				63	* copy because we would need turn the inline extent into a non-inline one (even
				64	* with NO_HOLES enabled). If we find our destination inode only has one inline
				65	* extent, just overwrite it with the source inline extent if its size is less
				66	* than the source extent's size, or we could copy the source inline extent's
				67	* data into the destination inode's inline extent if the later is greater then
				68	* the former.
				69	*/
				70	static int clone_copy_inline_extent(struct inode *dst,
				71	struct btrfs_trans_handle *trans,
				72	struct btrfs_path *path,
				73	struct btrfs_key *new_key,
				74	const u64 drop_start,
				75	const u64 datal,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	76	const u64 size,
Filipe Manana	a61e1e0	2020-02-28 13:04:18 +0000	[diff] [blame^]	77	const char *inline_data)
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	78	{
				79	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
				80	struct btrfs_root *root = BTRFS_I(dst)->root;
				81	const u64 aligned_end = ALIGN(new_key->offset + datal,
				82	fs_info->sectorsize);
				83	int ret;
				84	struct btrfs_key key;
				85
				86	if (new_key->offset > 0)
				87	return -EOPNOTSUPP;
				88
				89	key.objectid = btrfs_ino(BTRFS_I(dst));
				90	key.type = BTRFS_EXTENT_DATA_KEY;
				91	key.offset = 0;
				92	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				93	if (ret < 0) {
				94	return ret;
				95	} else if (ret > 0) {
				96	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				97	ret = btrfs_next_leaf(root, path);
				98	if (ret < 0)
				99	return ret;
				100	else if (ret > 0)
				101	goto copy_inline_extent;
				102	}
				103	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				104	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				105	key.type == BTRFS_EXTENT_DATA_KEY) {
				106	ASSERT(key.offset > 0);
				107	return -EOPNOTSUPP;
				108	}
				109	} else if (i_size_read(dst) <= datal) {
				110	struct btrfs_file_extent_item *ei;
				111	u64 ext_len;
				112
				113	/*
				114	* If the file size is <= datal, make sure there are no other
				115	* extents following (can happen do to an fallocate call with
				116	* the flag FALLOC_FL_KEEP_SIZE).
				117	*/
				118	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				119	struct btrfs_file_extent_item);
				120	/*
				121	* If it's an inline extent, it can not have other extents
				122	* following it.
				123	*/
				124	if (btrfs_file_extent_type(path->nodes[0], ei) ==
				125	BTRFS_FILE_EXTENT_INLINE)
				126	goto copy_inline_extent;
				127
				128	ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
				129	if (ext_len > aligned_end)
				130	return -EOPNOTSUPP;
				131
				132	ret = btrfs_next_item(root, path);
				133	if (ret < 0) {
				134	return ret;
				135	} else if (ret == 0) {
				136	btrfs_item_key_to_cpu(path->nodes[0], &key,
				137	path->slots[0]);
				138	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				139	key.type == BTRFS_EXTENT_DATA_KEY)
				140	return -EOPNOTSUPP;
				141	}
				142	}
				143
				144	copy_inline_extent:
				145	/*
				146	* We have no extent items, or we have an extent at offset 0 which may
				147	* or may not be inlined. All these cases are dealt the same way.
				148	*/
				149	if (i_size_read(dst) > datal) {
				150	/*
				151	* If the destination inode has an inline extent.
				152	* This would require copying the data from the source inline
				153	* extent into the beginning of the destination's inline extent.
				154	* But this is really complex, both extents can be compressed
				155	* or just one of them, which would require decompressing and
				156	* re-compressing data (which could increase the new compressed
				157	* size, not allowing the compressed data to fit anymore in an
				158	* inline extent).
				159	* So just don't support this case for now (it should be rare,
				160	* we are not really saving space when cloning inline extents).
				161	*/
				162	return -EOPNOTSUPP;
				163	}
				164
				165	btrfs_release_path(path);
				166	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
				167	if (ret)
				168	return ret;
				169	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
				170	if (ret)
				171	return ret;
				172
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	173	write_extent_buffer(path->nodes[0], inline_data,
				174	btrfs_item_ptr_offset(path->nodes[0],
				175	path->slots[0]),
				176	size);
				177	inode_add_bytes(dst, datal);
				178	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
				179
				180	return 0;
				181	}
				182
				183	/**
				184	* btrfs_clone() - clone a range from inode file to another
				185	*
				186	* @src: Inode to clone from
				187	* @inode: Inode to clone to
				188	* @off: Offset within source to start clone from
				189	* @olen: Original length, passed by user, of range to clone
				190	* @olen_aligned: Block-aligned value of olen
				191	* @destoff: Offset within @inode to start clone
				192	* @no_time_update: Whether to update mtime/ctime on the target inode
				193	*/
				194	static int btrfs_clone(struct inode src, struct inode inode,
				195	const u64 off, const u64 olen, const u64 olen_aligned,
				196	const u64 destoff, int no_time_update)
				197	{
				198	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				199	struct btrfs_root *root = BTRFS_I(inode)->root;
				200	struct btrfs_path *path = NULL;
				201	struct extent_buffer *leaf;
				202	struct btrfs_trans_handle *trans;
				203	char *buf = NULL;
				204	struct btrfs_key key;
				205	u32 nritems;
				206	int slot;
				207	int ret;
				208	const u64 len = olen_aligned;
				209	u64 last_dest_end = destoff;
				210
				211	ret = -ENOMEM;
				212	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
				213	if (!buf)
				214	return ret;
				215
				216	path = btrfs_alloc_path();
				217	if (!path) {
				218	kvfree(buf);
				219	return ret;
				220	}
				221
				222	path->reada = READA_FORWARD;
				223	/* Clone data */
				224	key.objectid = btrfs_ino(BTRFS_I(src));
				225	key.type = BTRFS_EXTENT_DATA_KEY;
				226	key.offset = off;
				227
				228	while (1) {
				229	u64 next_key_min_offset = key.offset + 1;
				230	struct btrfs_file_extent_item *extent;
				231	int type;
				232	u32 size;
				233	struct btrfs_key new_key;
				234	u64 disko = 0, diskl = 0;
				235	u64 datao = 0, datal = 0;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	236	u64 drop_start;
				237
				238	/* Note the key will change type as we walk through the tree */
				239	path->leave_spinning = 1;
				240	ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				241	0, 0);
				242	if (ret < 0)
				243	goto out;
				244	/*
				245	* First search, if no extent item that starts at offset off was
				246	* found but the previous item is an extent item, it's possible
				247	* it might overlap our target range, therefore process it.
				248	*/
				249	if (key.offset == off && ret > 0 && path->slots[0] > 0) {
				250	btrfs_item_key_to_cpu(path->nodes[0], &key,
				251	path->slots[0] - 1);
				252	if (key.type == BTRFS_EXTENT_DATA_KEY)
				253	path->slots[0]--;
				254	}
				255
				256	nritems = btrfs_header_nritems(path->nodes[0]);
				257	process_slot:
				258	if (path->slots[0] >= nritems) {
				259	ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
				260	if (ret < 0)
				261	goto out;
				262	if (ret > 0)
				263	break;
				264	nritems = btrfs_header_nritems(path->nodes[0]);
				265	}
				266	leaf = path->nodes[0];
				267	slot = path->slots[0];
				268
				269	btrfs_item_key_to_cpu(leaf, &key, slot);
				270	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
				271	key.objectid != btrfs_ino(BTRFS_I(src)))
				272	break;
				273
				274	ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
				275
				276	extent = btrfs_item_ptr(leaf, slot,
				277	struct btrfs_file_extent_item);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	278	type = btrfs_file_extent_type(leaf, extent);
				279	if (type == BTRFS_FILE_EXTENT_REG \|\|
				280	type == BTRFS_FILE_EXTENT_PREALLOC) {
				281	disko = btrfs_file_extent_disk_bytenr(leaf, extent);
				282	diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
				283	datao = btrfs_file_extent_offset(leaf, extent);
				284	datal = btrfs_file_extent_num_bytes(leaf, extent);
				285	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				286	/* Take upper bound, may be compressed */
				287	datal = btrfs_file_extent_ram_bytes(leaf, extent);
				288	}
				289
				290	/*
				291	* The first search might have left us at an extent item that
				292	* ends before our target range's start, can happen if we have
				293	* holes and NO_HOLES feature enabled.
				294	*/
				295	if (key.offset + datal <= off) {
				296	path->slots[0]++;
				297	goto process_slot;
				298	} else if (key.offset >= off + len) {
				299	break;
				300	}
				301	next_key_min_offset = key.offset + datal;
				302	size = btrfs_item_size_nr(leaf, slot);
				303	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				304	size);
				305
				306	btrfs_release_path(path);
				307	path->leave_spinning = 0;
				308
				309	memcpy(&new_key, &key, sizeof(new_key));
				310	new_key.objectid = btrfs_ino(BTRFS_I(inode));
				311	if (off <= key.offset)
				312	new_key.offset = key.offset + destoff - off;
				313	else
				314	new_key.offset = destoff;
				315
				316	/*
				317	* Deal with a hole that doesn't have an extent item that
				318	* represents it (NO_HOLES feature enabled).
				319	* This hole is either in the middle of the cloning range or at
				320	* the beginning (fully overlaps it or partially overlaps it).
				321	*/
				322	if (new_key.offset != last_dest_end)
				323	drop_start = last_dest_end;
				324	else
				325	drop_start = new_key.offset;
				326
				327	if (type == BTRFS_FILE_EXTENT_REG \|\|
				328	type == BTRFS_FILE_EXTENT_PREALLOC) {
				329	struct btrfs_clone_extent_info clone_info;
				330
				331	/*
				332	* a \| --- range to clone ---\| b
				333	* \| ------------- extent ------------- \|
				334	*/
				335
				336	/* Subtract range b */
				337	if (key.offset + datal > off + len)
				338	datal = off + len - key.offset;
				339
				340	/* Subtract range a */
				341	if (off > key.offset) {
				342	datao += off - key.offset;
				343	datal -= off - key.offset;
				344	}
				345
				346	clone_info.disk_offset = disko;
				347	clone_info.disk_len = diskl;
				348	clone_info.data_offset = datao;
				349	clone_info.data_len = datal;
				350	clone_info.file_offset = new_key.offset;
				351	clone_info.extent_buf = buf;
				352	clone_info.item_size = size;
				353	ret = btrfs_punch_hole_range(inode, path, drop_start,
				354	new_key.offset + datal - 1, &clone_info,
				355	&trans);
				356	if (ret)
				357	goto out;
				358	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
Filipe Manana	a61e1e0	2020-02-28 13:04:18 +0000	[diff] [blame^]	359	/*
				360	* Inline extents always have to start at file offset 0
				361	* and can never be bigger then the sector size. We can
				362	* never clone only parts of an inline extent, since all
				363	* reflink operations must start at a sector size aligned
				364	* offset, and the length must be aligned too or end at
				365	* the i_size (which implies the whole inlined data).
				366	*/
				367	ASSERT(key.offset == 0);
				368	ASSERT(datal <= fs_info->sectorsize);
				369	if (key.offset != 0 \|\| datal > fs_info->sectorsize)
				370	return -EUCLEAN;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	371
				372	/*
				373	* If our extent is inline, we know we will drop or
				374	* adjust at most 1 extent item in the destination root.
				375	*
				376	* 1 - adjusting old extent (we may have to split it)
				377	* 1 - add new extent
				378	* 1 - inode update
				379	*/
				380	trans = btrfs_start_transaction(root, 3);
				381	if (IS_ERR(trans)) {
				382	ret = PTR_ERR(trans);
				383	goto out;
				384	}
				385
				386	ret = clone_copy_inline_extent(inode, trans, path,
				387	&new_key, drop_start,
Filipe Manana	a61e1e0	2020-02-28 13:04:18 +0000	[diff] [blame^]	388	datal, size, buf);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	389	if (ret) {
				390	if (ret != -EOPNOTSUPP)
				391	btrfs_abort_transaction(trans, ret);
				392	btrfs_end_transaction(trans);
				393	goto out;
				394	}
				395	}
				396
				397	btrfs_release_path(path);
				398
				399	last_dest_end = ALIGN(new_key.offset + datal,
				400	fs_info->sectorsize);
				401	ret = clone_finish_inode_update(trans, inode, last_dest_end,
				402	destoff, olen, no_time_update);
				403	if (ret)
				404	goto out;
				405	if (new_key.offset + datal >= destoff + len)
				406	break;
				407
				408	btrfs_release_path(path);
				409	key.offset = next_key_min_offset;
				410
				411	if (fatal_signal_pending(current)) {
				412	ret = -EINTR;
				413	goto out;
				414	}
				415	}
				416	ret = 0;
				417
				418	if (last_dest_end < destoff + len) {
				419	/*
				420	* We have an implicit hole that fully or partially overlaps our
				421	* cloning range at its end. This means that we either have the
				422	* NO_HOLES feature enabled or the implicit hole happened due to
				423	* mixing buffered and direct IO writes against this file.
				424	*/
				425	btrfs_release_path(path);
				426	path->leave_spinning = 0;
				427
				428	ret = btrfs_punch_hole_range(inode, path, last_dest_end,
				429	destoff + len - 1, NULL, &trans);
				430	if (ret)
				431	goto out;
				432
				433	ret = clone_finish_inode_update(trans, inode, destoff + len,
				434	destoff, olen, no_time_update);
				435	}
				436
				437	out:
				438	btrfs_free_path(path);
				439	kvfree(buf);
				440	return ret;
				441	}
				442
				443	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				444	struct inode *inode2, u64 loff2, u64 len)
				445	{
				446	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				447	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				448	}
				449
				450	static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				451	struct inode *inode2, u64 loff2, u64 len)
				452	{
				453	if (inode1 < inode2) {
				454	swap(inode1, inode2);
				455	swap(loff1, loff2);
				456	} else if (inode1 == inode2 && loff2 < loff1) {
				457	swap(loff1, loff2);
				458	}
				459	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				460	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				461	}
				462
				463	static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				464	struct inode *dst, u64 dst_loff)
				465	{
				466	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
				467	int ret;
				468
				469	/*
				470	* Lock destination range to serialize with concurrent readpages() and
				471	* source range to serialize with relocation.
				472	*/
				473	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
				474	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
				475	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
				476
				477	return ret;
				478	}
				479
				480	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
				481	struct inode *dst, u64 dst_loff)
				482	{
				483	int ret;
				484	u64 i, tail_len, chunk_count;
				485	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
				486
				487	spin_lock(&root_dst->root_item_lock);
				488	if (root_dst->send_in_progress) {
				489	btrfs_warn_rl(root_dst->fs_info,
				490	"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
				491	root_dst->root_key.objectid,
				492	root_dst->send_in_progress);
				493	spin_unlock(&root_dst->root_item_lock);
				494	return -EAGAIN;
				495	}
				496	root_dst->dedupe_in_progress++;
				497	spin_unlock(&root_dst->root_item_lock);
				498
				499	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
				500	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
				501
				502	for (i = 0; i < chunk_count; i++) {
				503	ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
				504	dst, dst_loff);
				505	if (ret)
				506	goto out;
				507
				508	loff += BTRFS_MAX_DEDUPE_LEN;
				509	dst_loff += BTRFS_MAX_DEDUPE_LEN;
				510	}
				511
				512	if (tail_len > 0)
				513	ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
				514	out:
				515	spin_lock(&root_dst->root_item_lock);
				516	root_dst->dedupe_in_progress--;
				517	spin_unlock(&root_dst->root_item_lock);
				518
				519	return ret;
				520	}
				521
				522	static noinline int btrfs_clone_files(struct file file, struct file file_src,
				523	u64 off, u64 olen, u64 destoff)
				524	{
				525	struct inode *inode = file_inode(file);
				526	struct inode *src = file_inode(file_src);
				527	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				528	int ret;
				529	u64 len = olen;
				530	u64 bs = fs_info->sb->s_blocksize;
				531
				532	/*
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	533	* VFS's generic_remap_file_range_prep() protects us from cloning the
				534	* eof block into the middle of a file, which would result in corruption
				535	* if the file size is not blocksize aligned. So we don't need to check
				536	* for that case here.
				537	*/
				538	if (off + len == src->i_size)
				539	len = ALIGN(src->i_size, bs) - off;
				540
				541	if (destoff > inode->i_size) {
				542	const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
				543
				544	ret = btrfs_cont_expand(inode, inode->i_size, destoff);
				545	if (ret)
				546	return ret;
				547	/*
				548	* We may have truncated the last block if the inode's size is
				549	* not sector size aligned, so we need to wait for writeback to
				550	* complete before proceeding further, otherwise we can race
				551	* with cloning and attempt to increment a reference to an
				552	* extent that no longer exists (writeback completed right after
				553	* we found the previous extent covering eof and before we
				554	* attempted to increment its reference count).
				555	*/
				556	ret = btrfs_wait_ordered_range(inode, wb_start,
				557	destoff - wb_start);
				558	if (ret)
				559	return ret;
				560	}
				561
				562	/*
				563	* Lock destination range to serialize with concurrent readpages() and
				564	* source range to serialize with relocation.
				565	*/
				566	btrfs_double_extent_lock(src, off, inode, destoff, len);
				567	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
				568	btrfs_double_extent_unlock(src, off, inode, destoff, len);
				569	/*
				570	* Truncate page cache pages so that future reads will see the cloned
				571	* data immediately and not the previous data.
				572	*/
				573	truncate_inode_pages_range(&inode->i_data,
				574	round_down(destoff, PAGE_SIZE),
				575	round_up(destoff + len, PAGE_SIZE) - 1);
				576
				577	return ret;
				578	}
				579
				580	static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				581	struct file *file_out, loff_t pos_out,
				582	loff_t *len, unsigned int remap_flags)
				583	{
				584	struct inode *inode_in = file_inode(file_in);
				585	struct inode *inode_out = file_inode(file_out);
				586	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
				587	bool same_inode = inode_out == inode_in;
				588	u64 wb_len;
				589	int ret;
				590
				591	if (!(remap_flags & REMAP_FILE_DEDUP)) {
				592	struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
				593
				594	if (btrfs_root_readonly(root_out))
				595	return -EROFS;
				596
				597	if (file_in->f_path.mnt != file_out->f_path.mnt \|\|
				598	inode_in->i_sb != inode_out->i_sb)
				599	return -EXDEV;
				600	}
				601
				602	/* Don't make the dst file partly checksummed */
				603	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
				604	(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
				605	return -EINVAL;
				606	}
				607
				608	/*
				609	* Now that the inodes are locked, we need to start writeback ourselves
				610	* and can not rely on the writeback from the VFS's generic helper
				611	* generic_remap_file_range_prep() because:
				612	*
				613	* 1) For compression we must call filemap_fdatawrite_range() range
				614	* twice (btrfs_fdatawrite_range() does it for us), and the generic
				615	* helper only calls it once;
				616	*
				617	* 2) filemap_fdatawrite_range(), called by the generic helper only
				618	* waits for the writeback to complete, i.e. for IO to be done, and
				619	* not for the ordered extents to complete. We need to wait for them
				620	* to complete so that new file extent items are in the fs tree.
				621	*/
				622	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
				623	wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
				624	else
				625	wb_len = ALIGN(*len, bs);
				626
				627	/*
				628	* Since we don't lock ranges, wait for ongoing lockless dio writes (as
				629	* any in progress could create its ordered extents after we wait for
				630	* existing ordered extents below).
				631	*/
				632	inode_dio_wait(inode_in);
				633	if (!same_inode)
				634	inode_dio_wait(inode_out);
				635
				636	/*
				637	* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
				638	*
				639	* Btrfs' back references do not have a block level granularity, they
				640	* work at the whole extent level.
				641	* NOCOW buffered write without data space reserved may not be able
				642	* to fall back to CoW due to lack of data space, thus could cause
				643	* data loss.
				644	*
				645	* Here we take a shortcut by flushing the whole inode, so that all
				646	* nocow write should reach disk as nocow before we increase the
				647	* reference of the extent. We could do better by only flushing NOCOW
				648	* data, but that needs extra accounting.
				649	*
				650	* Also we don't need to check ASYNC_EXTENT, as async extent will be
				651	* CoWed anyway, not affecting nocow part.
				652	*/
				653	ret = filemap_flush(inode_in->i_mapping);
				654	if (ret < 0)
				655	return ret;
				656
				657	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				658	wb_len);
				659	if (ret < 0)
				660	return ret;
				661	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				662	wb_len);
				663	if (ret < 0)
				664	return ret;
				665
				666	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
				667	len, remap_flags);
				668	}
				669
				670	loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
				671	struct file *dst_file, loff_t destoff, loff_t len,
				672	unsigned int remap_flags)
				673	{
				674	struct inode *src_inode = file_inode(src_file);
				675	struct inode *dst_inode = file_inode(dst_file);
				676	bool same_inode = dst_inode == src_inode;
				677	int ret;
				678
				679	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
				680	return -EINVAL;
				681
				682	if (same_inode)
				683	inode_lock(src_inode);
				684	else
				685	lock_two_nondirectories(src_inode, dst_inode);
				686
				687	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
				688	&len, remap_flags);
				689	if (ret < 0 \|\| len == 0)
				690	goto out_unlock;
				691
				692	if (remap_flags & REMAP_FILE_DEDUP)
				693	ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
				694	else
				695	ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
				696
				697	out_unlock:
				698	if (same_inode)
				699	inode_unlock(src_inode);
				700	else
				701	unlock_two_nondirectories(src_inode, dst_inode);
				702
				703	return ret < 0 ? ret : len;
				704	}