Blame - fs/btrfs/reflink.c - SHIFTPHONES/kernel/common

blob: 99aa87c089121b79acf26bcd3503f7f73f69ad5e [file] [log] [blame]

Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	3	#include <linux/blkdev.h>
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	4	#include <linux/iversion.h>
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	5	#include "compression.h"
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	6	#include "ctree.h"
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	7	#include "delalloc-space.h"
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	8	#include "reflink.h"
				9	#include "transaction.h"
				10
				11	#define BTRFS_MAX_DEDUPE_LEN SZ_16M
				12
				13	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				14	struct inode *inode,
				15	u64 endoff,
				16	const u64 destoff,
				17	const u64 olen,
				18	int no_time_update)
				19	{
				20	struct btrfs_root *root = BTRFS_I(inode)->root;
				21	int ret;
				22
				23	inode_inc_iversion(inode);
				24	if (!no_time_update)
				25	inode->i_mtime = inode->i_ctime = current_time(inode);
				26	/*
				27	* We round up to the block size at eof when determining which
				28	* extents to clone above, but shouldn't round up the file size.
				29	*/
				30	if (endoff > destoff + olen)
				31	endoff = destoff + olen;
				32	if (endoff > inode->i_size) {
				33	i_size_write(inode, endoff);
				34	btrfs_inode_safe_disk_i_size_write(inode, 0);
				35	}
				36
				37	ret = btrfs_update_inode(trans, root, inode);
				38	if (ret) {
				39	btrfs_abort_transaction(trans, ret);
				40	btrfs_end_transaction(trans);
				41	goto out;
				42	}
				43	ret = btrfs_end_transaction(trans);
				44	out:
				45	return ret;
				46	}
				47
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	48	static int copy_inline_to_page(struct btrfs_inode *inode,
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	49	const u64 file_offset,
				50	char *inline_data,
				51	const u64 size,
				52	const u64 datal,
				53	const u8 comp_type)
				54	{
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	55	const u64 block_size = btrfs_inode_sectorsize(inode);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	56	const u64 range_end = file_offset + block_size - 1;
				57	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
				58	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
				59	struct extent_changeset *data_reserved = NULL;
				60	struct page *page = NULL;
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	61	struct address_space *mapping = inode->vfs_inode.i_mapping;
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	62	int ret;
				63
				64	ASSERT(IS_ALIGNED(file_offset, block_size));
				65
				66	/*
				67	* We have flushed and locked the ranges of the source and destination
				68	* inodes, we also have locked the inodes, so we are safe to do a
				69	* reservation here. Also we must not do the reservation while holding
				70	* a transaction open, otherwise we would deadlock.
				71	*/
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	72	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset,
				73	block_size);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	74	if (ret)
				75	goto out;
				76
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	77	page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT,
				78	btrfs_alloc_write_mask(mapping));
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	79	if (!page) {
				80	ret = -ENOMEM;
				81	goto out_unlock;
				82	}
				83
				84	set_page_extent_mapped(page);
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	85	clear_extent_bit(&inode->io_tree, file_offset, range_end,
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	86	EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG,
				87	0, 0, NULL);
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	88	ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	89	if (ret)
				90	goto out_unlock;
				91
				92	if (comp_type == BTRFS_COMPRESS_NONE) {
				93	char *map;
				94
				95	map = kmap(page);
				96	memcpy(map, data_start, datal);
				97	flush_dcache_page(page);
				98	kunmap(page);
				99	} else {
				100	ret = btrfs_decompress(comp_type, data_start, page, 0,
				101	inline_size, datal);
				102	if (ret)
				103	goto out_unlock;
				104	flush_dcache_page(page);
				105	}
				106
				107	/*
				108	* If our inline data is smaller then the block/page size, then the
				109	* remaining of the block/page is equivalent to zeroes. We had something
				110	* like the following done:
				111	*
				112	* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
				113	* $ sync # (or fsync)
				114	* $ xfs_io -c "falloc 0 4K" file
				115	* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
				116	*
				117	* So what's in the range [500, 4095] corresponds to zeroes.
				118	*/
				119	if (datal < block_size) {
				120	char *map;
				121
				122	map = kmap(page);
				123	memset(map + datal, 0, block_size - datal);
				124	flush_dcache_page(page);
				125	kunmap(page);
				126	}
				127
				128	SetPageUptodate(page);
				129	ClearPageChecked(page);
				130	set_page_dirty(page);
				131	out_unlock:
				132	if (page) {
				133	unlock_page(page);
				134	put_page(page);
				135	}
				136	if (ret)
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	137	btrfs_delalloc_release_space(inode, data_reserved, file_offset,
				138	block_size, true);
				139	btrfs_delalloc_release_extents(inode, block_size);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	140	out:
				141	extent_changeset_free(data_reserved);
				142
				143	return ret;
				144	}
				145
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	146	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	147	* Deal with cloning of inline extents. We try to copy the inline extent from
				148	* the source inode to destination inode when possible. When not possible we
				149	* copy the inline extent's data into the respective page of the inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	150	*/
				151	static int clone_copy_inline_extent(struct inode *dst,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	152	struct btrfs_path *path,
				153	struct btrfs_key *new_key,
				154	const u64 drop_start,
				155	const u64 datal,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	156	const u64 size,
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	157	const u8 comp_type,
				158	char *inline_data,
				159	struct btrfs_trans_handle **trans_out)
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	160	{
				161	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
				162	struct btrfs_root *root = BTRFS_I(dst)->root;
				163	const u64 aligned_end = ALIGN(new_key->offset + datal,
				164	fs_info->sectorsize);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	165	struct btrfs_trans_handle *trans = NULL;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	166	int ret;
				167	struct btrfs_key key;
				168
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	169	if (new_key->offset > 0) {
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	170	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				171	inline_data, size, datal, comp_type);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	172	goto out;
				173	}
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	174
				175	key.objectid = btrfs_ino(BTRFS_I(dst));
				176	key.type = BTRFS_EXTENT_DATA_KEY;
				177	key.offset = 0;
				178	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				179	if (ret < 0) {
				180	return ret;
				181	} else if (ret > 0) {
				182	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				183	ret = btrfs_next_leaf(root, path);
				184	if (ret < 0)
				185	return ret;
				186	else if (ret > 0)
				187	goto copy_inline_extent;
				188	}
				189	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				190	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				191	key.type == BTRFS_EXTENT_DATA_KEY) {
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	192	/*
				193	* There's an implicit hole at file offset 0, copy the
				194	* inline extent's data to the page.
				195	*/
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	196	ASSERT(key.offset > 0);
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	197	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	198	inline_data, size, datal,
				199	comp_type);
				200	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	201	}
				202	} else if (i_size_read(dst) <= datal) {
				203	struct btrfs_file_extent_item *ei;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	204
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	205	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				206	struct btrfs_file_extent_item);
				207	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	208	* If it's an inline extent replace it with the source inline
				209	* extent, otherwise copy the source inline extent data into
				210	* the respective page at the destination inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	211	*/
				212	if (btrfs_file_extent_type(path->nodes[0], ei) ==
				213	BTRFS_FILE_EXTENT_INLINE)
				214	goto copy_inline_extent;
				215
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	216	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				217	inline_data, size, datal, comp_type);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	218	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	219	}
				220
				221	copy_inline_extent:
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	222	ret = 0;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	223	/*
				224	* We have no extent items, or we have an extent at offset 0 which may
				225	* or may not be inlined. All these cases are dealt the same way.
				226	*/
				227	if (i_size_read(dst) > datal) {
				228	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	229	* At the destination offset 0 we have either a hole, a regular
				230	* extent or an inline extent larger then the one we want to
				231	* clone. Deal with all these cases by copying the inline extent
				232	* data into the respective page at the destination inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	233	*/
Nikolay Borisov	998acfe	2020-08-31 14:42:47 +0300	[diff] [blame]	234	ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
				235	inline_data, size, datal, comp_type);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	236	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	237	}
				238
				239	btrfs_release_path(path);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	240	/*
				241	* If we end up here it means were copy the inline extent into a leaf
				242	* of the destination inode. We know we will drop or adjust at most one
				243	* extent item in the destination root.
				244	*
				245	* 1 unit - adjusting old extent (we may have to split it)
				246	* 1 unit - add new extent
				247	* 1 unit - inode update
				248	*/
				249	trans = btrfs_start_transaction(root, 3);
				250	if (IS_ERR(trans)) {
				251	ret = PTR_ERR(trans);
				252	trans = NULL;
				253	goto out;
				254	}
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	255	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
				256	if (ret)
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	257	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	258	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
				259	if (ret)
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	260	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	261
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	262	write_extent_buffer(path->nodes[0], inline_data,
				263	btrfs_item_ptr_offset(path->nodes[0],
				264	path->slots[0]),
				265	size);
				266	inode_add_bytes(dst, datal);
				267	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
Filipe Manana	4fdb688	2020-04-04 21:20:22 +0100	[diff] [blame]	268	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	269	out:
				270	if (!ret && !trans) {
				271	/*
				272	* No transaction here means we copied the inline extent into a
				273	* page of the destination inode.
				274	*
				275	* 1 unit to update inode item
				276	*/
				277	trans = btrfs_start_transaction(root, 1);
				278	if (IS_ERR(trans)) {
				279	ret = PTR_ERR(trans);
				280	trans = NULL;
				281	}
				282	}
				283	if (ret && trans) {
				284	btrfs_abort_transaction(trans, ret);
				285	btrfs_end_transaction(trans);
				286	}
				287	if (!ret)
				288	*trans_out = trans;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	289
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	290	return ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	291	}
				292
				293	/**
				294	* btrfs_clone() - clone a range from inode file to another
				295	*
				296	* @src: Inode to clone from
				297	* @inode: Inode to clone to
				298	* @off: Offset within source to start clone from
				299	* @olen: Original length, passed by user, of range to clone
				300	* @olen_aligned: Block-aligned value of olen
				301	* @destoff: Offset within @inode to start clone
				302	* @no_time_update: Whether to update mtime/ctime on the target inode
				303	*/
				304	static int btrfs_clone(struct inode src, struct inode inode,
				305	const u64 off, const u64 olen, const u64 olen_aligned,
				306	const u64 destoff, int no_time_update)
				307	{
				308	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	309	struct btrfs_path *path = NULL;
				310	struct extent_buffer *leaf;
				311	struct btrfs_trans_handle *trans;
				312	char *buf = NULL;
				313	struct btrfs_key key;
				314	u32 nritems;
				315	int slot;
				316	int ret;
				317	const u64 len = olen_aligned;
				318	u64 last_dest_end = destoff;
				319
				320	ret = -ENOMEM;
				321	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
				322	if (!buf)
				323	return ret;
				324
				325	path = btrfs_alloc_path();
				326	if (!path) {
				327	kvfree(buf);
				328	return ret;
				329	}
				330
				331	path->reada = READA_FORWARD;
				332	/* Clone data */
				333	key.objectid = btrfs_ino(BTRFS_I(src));
				334	key.type = BTRFS_EXTENT_DATA_KEY;
				335	key.offset = off;
				336
				337	while (1) {
				338	u64 next_key_min_offset = key.offset + 1;
				339	struct btrfs_file_extent_item *extent;
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	340	u64 extent_gen;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	341	int type;
				342	u32 size;
				343	struct btrfs_key new_key;
				344	u64 disko = 0, diskl = 0;
				345	u64 datao = 0, datal = 0;
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	346	u8 comp;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	347	u64 drop_start;
				348
				349	/* Note the key will change type as we walk through the tree */
				350	path->leave_spinning = 1;
				351	ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				352	0, 0);
				353	if (ret < 0)
				354	goto out;
				355	/*
				356	* First search, if no extent item that starts at offset off was
				357	* found but the previous item is an extent item, it's possible
				358	* it might overlap our target range, therefore process it.
				359	*/
				360	if (key.offset == off && ret > 0 && path->slots[0] > 0) {
				361	btrfs_item_key_to_cpu(path->nodes[0], &key,
				362	path->slots[0] - 1);
				363	if (key.type == BTRFS_EXTENT_DATA_KEY)
				364	path->slots[0]--;
				365	}
				366
				367	nritems = btrfs_header_nritems(path->nodes[0]);
				368	process_slot:
				369	if (path->slots[0] >= nritems) {
				370	ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
				371	if (ret < 0)
				372	goto out;
				373	if (ret > 0)
				374	break;
				375	nritems = btrfs_header_nritems(path->nodes[0]);
				376	}
				377	leaf = path->nodes[0];
				378	slot = path->slots[0];
				379
				380	btrfs_item_key_to_cpu(leaf, &key, slot);
				381	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
				382	key.objectid != btrfs_ino(BTRFS_I(src)))
				383	break;
				384
				385	ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
				386
				387	extent = btrfs_item_ptr(leaf, slot,
				388	struct btrfs_file_extent_item);
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	389	extent_gen = btrfs_file_extent_generation(leaf, extent);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	390	comp = btrfs_file_extent_compression(leaf, extent);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	391	type = btrfs_file_extent_type(leaf, extent);
				392	if (type == BTRFS_FILE_EXTENT_REG \|\|
				393	type == BTRFS_FILE_EXTENT_PREALLOC) {
				394	disko = btrfs_file_extent_disk_bytenr(leaf, extent);
				395	diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
				396	datao = btrfs_file_extent_offset(leaf, extent);
				397	datal = btrfs_file_extent_num_bytes(leaf, extent);
				398	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				399	/* Take upper bound, may be compressed */
				400	datal = btrfs_file_extent_ram_bytes(leaf, extent);
				401	}
				402
				403	/*
				404	* The first search might have left us at an extent item that
				405	* ends before our target range's start, can happen if we have
				406	* holes and NO_HOLES feature enabled.
				407	*/
				408	if (key.offset + datal <= off) {
				409	path->slots[0]++;
				410	goto process_slot;
				411	} else if (key.offset >= off + len) {
				412	break;
				413	}
				414	next_key_min_offset = key.offset + datal;
				415	size = btrfs_item_size_nr(leaf, slot);
				416	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				417	size);
				418
				419	btrfs_release_path(path);
				420	path->leave_spinning = 0;
				421
				422	memcpy(&new_key, &key, sizeof(new_key));
				423	new_key.objectid = btrfs_ino(BTRFS_I(inode));
				424	if (off <= key.offset)
				425	new_key.offset = key.offset + destoff - off;
				426	else
				427	new_key.offset = destoff;
				428
				429	/*
				430	* Deal with a hole that doesn't have an extent item that
				431	* represents it (NO_HOLES feature enabled).
				432	* This hole is either in the middle of the cloning range or at
				433	* the beginning (fully overlaps it or partially overlaps it).
				434	*/
				435	if (new_key.offset != last_dest_end)
				436	drop_start = last_dest_end;
				437	else
				438	drop_start = new_key.offset;
				439
				440	if (type == BTRFS_FILE_EXTENT_REG \|\|
				441	type == BTRFS_FILE_EXTENT_PREALLOC) {
Filipe Manana	bf38564	2020-09-08 11:27:22 +0100	[diff] [blame]	442	struct btrfs_replace_extent_info clone_info;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	443
				444	/*
				445	* a \| --- range to clone ---\| b
				446	* \| ------------- extent ------------- \|
				447	*/
				448
				449	/* Subtract range b */
				450	if (key.offset + datal > off + len)
				451	datal = off + len - key.offset;
				452
				453	/* Subtract range a */
				454	if (off > key.offset) {
				455	datao += off - key.offset;
				456	datal -= off - key.offset;
				457	}
				458
				459	clone_info.disk_offset = disko;
				460	clone_info.disk_len = diskl;
				461	clone_info.data_offset = datao;
				462	clone_info.data_len = datal;
				463	clone_info.file_offset = new_key.offset;
				464	clone_info.extent_buf = buf;
Filipe Manana	8fccebf	2020-09-08 11:27:20 +0100	[diff] [blame]	465	clone_info.is_new_extent = false;
Filipe Manana	306bfec	2020-09-08 11:27:23 +0100	[diff] [blame]	466	ret = btrfs_replace_file_extents(inode, path, drop_start,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	467	new_key.offset + datal - 1, &clone_info,
				468	&trans);
				469	if (ret)
				470	goto out;
				471	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
Filipe Manana	a61e1e0	2020-02-28 13:04:18 +0000	[diff] [blame]	472	/*
				473	* Inline extents always have to start at file offset 0
				474	* and can never be bigger then the sector size. We can
				475	* never clone only parts of an inline extent, since all
				476	* reflink operations must start at a sector size aligned
				477	* offset, and the length must be aligned too or end at
				478	* the i_size (which implies the whole inlined data).
				479	*/
				480	ASSERT(key.offset == 0);
				481	ASSERT(datal <= fs_info->sectorsize);
				482	if (key.offset != 0 \|\| datal > fs_info->sectorsize)
				483	return -EUCLEAN;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	484
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	485	ret = clone_copy_inline_extent(inode, path, &new_key,
				486	drop_start, datal, size,
				487	comp, buf, &trans);
				488	if (ret)
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	489	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	490	}
				491
				492	btrfs_release_path(path);
				493
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	494	/*
				495	* If this is a new extent update the last_reflink_trans of both
				496	* inodes. This is used by fsync to make sure it does not log
				497	* multiple checksum items with overlapping ranges. For older
				498	* extents we don't need to do it since inode logging skips the
				499	* checksums for older extents. Also ignore holes and inline
				500	* extents because they don't have checksums in the csum tree.
				501	*/
				502	if (extent_gen == trans->transid && disko > 0) {
				503	BTRFS_I(src)->last_reflink_trans = trans->transid;
				504	BTRFS_I(inode)->last_reflink_trans = trans->transid;
				505	}
				506
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	507	last_dest_end = ALIGN(new_key.offset + datal,
				508	fs_info->sectorsize);
				509	ret = clone_finish_inode_update(trans, inode, last_dest_end,
				510	destoff, olen, no_time_update);
				511	if (ret)
				512	goto out;
				513	if (new_key.offset + datal >= destoff + len)
				514	break;
				515
				516	btrfs_release_path(path);
				517	key.offset = next_key_min_offset;
				518
				519	if (fatal_signal_pending(current)) {
				520	ret = -EINTR;
				521	goto out;
				522	}
Johannes Thumshirn	6b613cc	2020-09-22 17:27:29 +0900	[diff] [blame^]	523
				524	cond_resched();
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	525	}
				526	ret = 0;
				527
				528	if (last_dest_end < destoff + len) {
				529	/*
				530	* We have an implicit hole that fully or partially overlaps our
				531	* cloning range at its end. This means that we either have the
				532	* NO_HOLES feature enabled or the implicit hole happened due to
				533	* mixing buffered and direct IO writes against this file.
				534	*/
				535	btrfs_release_path(path);
				536	path->leave_spinning = 0;
				537
Filipe Manana	306bfec	2020-09-08 11:27:23 +0100	[diff] [blame]	538	ret = btrfs_replace_file_extents(inode, path, last_dest_end,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	539	destoff + len - 1, NULL, &trans);
				540	if (ret)
				541	goto out;
				542
				543	ret = clone_finish_inode_update(trans, inode, destoff + len,
				544	destoff, olen, no_time_update);
				545	}
				546
				547	out:
				548	btrfs_free_path(path);
				549	kvfree(buf);
				550	return ret;
				551	}
				552
				553	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				554	struct inode *inode2, u64 loff2, u64 len)
				555	{
				556	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				557	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				558	}
				559
				560	static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				561	struct inode *inode2, u64 loff2, u64 len)
				562	{
				563	if (inode1 < inode2) {
				564	swap(inode1, inode2);
				565	swap(loff1, loff2);
				566	} else if (inode1 == inode2 && loff2 < loff1) {
				567	swap(loff1, loff2);
				568	}
				569	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				570	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				571	}
				572
				573	static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				574	struct inode *dst, u64 dst_loff)
				575	{
				576	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
				577	int ret;
				578
				579	/*
				580	* Lock destination range to serialize with concurrent readpages() and
				581	* source range to serialize with relocation.
				582	*/
				583	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
				584	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
				585	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
				586
				587	return ret;
				588	}
				589
				590	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
				591	struct inode *dst, u64 dst_loff)
				592	{
				593	int ret;
				594	u64 i, tail_len, chunk_count;
				595	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
				596
				597	spin_lock(&root_dst->root_item_lock);
				598	if (root_dst->send_in_progress) {
				599	btrfs_warn_rl(root_dst->fs_info,
				600	"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
				601	root_dst->root_key.objectid,
				602	root_dst->send_in_progress);
				603	spin_unlock(&root_dst->root_item_lock);
				604	return -EAGAIN;
				605	}
				606	root_dst->dedupe_in_progress++;
				607	spin_unlock(&root_dst->root_item_lock);
				608
				609	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
				610	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
				611
				612	for (i = 0; i < chunk_count; i++) {
				613	ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
				614	dst, dst_loff);
				615	if (ret)
				616	goto out;
				617
				618	loff += BTRFS_MAX_DEDUPE_LEN;
				619	dst_loff += BTRFS_MAX_DEDUPE_LEN;
				620	}
				621
				622	if (tail_len > 0)
				623	ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
				624	out:
				625	spin_lock(&root_dst->root_item_lock);
				626	root_dst->dedupe_in_progress--;
				627	spin_unlock(&root_dst->root_item_lock);
				628
				629	return ret;
				630	}
				631
				632	static noinline int btrfs_clone_files(struct file file, struct file file_src,
				633	u64 off, u64 olen, u64 destoff)
				634	{
				635	struct inode *inode = file_inode(file);
				636	struct inode *src = file_inode(file_src);
				637	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				638	int ret;
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	639	int wb_ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	640	u64 len = olen;
				641	u64 bs = fs_info->sb->s_blocksize;
				642
				643	/*
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	644	* VFS's generic_remap_file_range_prep() protects us from cloning the
				645	* eof block into the middle of a file, which would result in corruption
				646	* if the file size is not blocksize aligned. So we don't need to check
				647	* for that case here.
				648	*/
				649	if (off + len == src->i_size)
				650	len = ALIGN(src->i_size, bs) - off;
				651
				652	if (destoff > inode->i_size) {
				653	const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
				654
				655	ret = btrfs_cont_expand(inode, inode->i_size, destoff);
				656	if (ret)
				657	return ret;
				658	/*
				659	* We may have truncated the last block if the inode's size is
				660	* not sector size aligned, so we need to wait for writeback to
				661	* complete before proceeding further, otherwise we can race
				662	* with cloning and attempt to increment a reference to an
				663	* extent that no longer exists (writeback completed right after
				664	* we found the previous extent covering eof and before we
				665	* attempted to increment its reference count).
				666	*/
				667	ret = btrfs_wait_ordered_range(inode, wb_start,
				668	destoff - wb_start);
				669	if (ret)
				670	return ret;
				671	}
				672
				673	/*
				674	* Lock destination range to serialize with concurrent readpages() and
				675	* source range to serialize with relocation.
				676	*/
				677	btrfs_double_extent_lock(src, off, inode, destoff, len);
				678	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
				679	btrfs_double_extent_unlock(src, off, inode, destoff, len);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	680
				681	/*
				682	* We may have copied an inline extent into a page of the destination
				683	* range, so wait for writeback to complete before truncating pages
				684	* from the page cache. This is a rare case.
				685	*/
				686	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
				687	ret = ret ? ret : wb_ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	688	/*
				689	* Truncate page cache pages so that future reads will see the cloned
				690	* data immediately and not the previous data.
				691	*/
				692	truncate_inode_pages_range(&inode->i_data,
				693	round_down(destoff, PAGE_SIZE),
				694	round_up(destoff + len, PAGE_SIZE) - 1);
				695
				696	return ret;
				697	}
				698
				699	static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				700	struct file *file_out, loff_t pos_out,
				701	loff_t *len, unsigned int remap_flags)
				702	{
				703	struct inode *inode_in = file_inode(file_in);
				704	struct inode *inode_out = file_inode(file_out);
				705	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
				706	bool same_inode = inode_out == inode_in;
				707	u64 wb_len;
				708	int ret;
				709
				710	if (!(remap_flags & REMAP_FILE_DEDUP)) {
				711	struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
				712
				713	if (btrfs_root_readonly(root_out))
				714	return -EROFS;
				715
				716	if (file_in->f_path.mnt != file_out->f_path.mnt \|\|
				717	inode_in->i_sb != inode_out->i_sb)
				718	return -EXDEV;
				719	}
				720
				721	/* Don't make the dst file partly checksummed */
				722	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
				723	(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
				724	return -EINVAL;
				725	}
				726
				727	/*
				728	* Now that the inodes are locked, we need to start writeback ourselves
				729	* and can not rely on the writeback from the VFS's generic helper
				730	* generic_remap_file_range_prep() because:
				731	*
				732	* 1) For compression we must call filemap_fdatawrite_range() range
				733	* twice (btrfs_fdatawrite_range() does it for us), and the generic
				734	* helper only calls it once;
				735	*
				736	* 2) filemap_fdatawrite_range(), called by the generic helper only
				737	* waits for the writeback to complete, i.e. for IO to be done, and
				738	* not for the ordered extents to complete. We need to wait for them
				739	* to complete so that new file extent items are in the fs tree.
				740	*/
				741	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
				742	wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
				743	else
				744	wb_len = ALIGN(*len, bs);
				745
				746	/*
				747	* Since we don't lock ranges, wait for ongoing lockless dio writes (as
				748	* any in progress could create its ordered extents after we wait for
				749	* existing ordered extents below).
				750	*/
				751	inode_dio_wait(inode_in);
				752	if (!same_inode)
				753	inode_dio_wait(inode_out);
				754
				755	/*
				756	* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
				757	*
				758	* Btrfs' back references do not have a block level granularity, they
				759	* work at the whole extent level.
				760	* NOCOW buffered write without data space reserved may not be able
				761	* to fall back to CoW due to lack of data space, thus could cause
				762	* data loss.
				763	*
				764	* Here we take a shortcut by flushing the whole inode, so that all
				765	* nocow write should reach disk as nocow before we increase the
				766	* reference of the extent. We could do better by only flushing NOCOW
				767	* data, but that needs extra accounting.
				768	*
				769	* Also we don't need to check ASYNC_EXTENT, as async extent will be
				770	* CoWed anyway, not affecting nocow part.
				771	*/
				772	ret = filemap_flush(inode_in->i_mapping);
				773	if (ret < 0)
				774	return ret;
				775
				776	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				777	wb_len);
				778	if (ret < 0)
				779	return ret;
				780	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				781	wb_len);
				782	if (ret < 0)
				783	return ret;
				784
				785	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
				786	len, remap_flags);
				787	}
				788
				789	loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
				790	struct file *dst_file, loff_t destoff, loff_t len,
				791	unsigned int remap_flags)
				792	{
				793	struct inode *src_inode = file_inode(src_file);
				794	struct inode *dst_inode = file_inode(dst_file);
				795	bool same_inode = dst_inode == src_inode;
				796	int ret;
				797
				798	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
				799	return -EINVAL;
				800
				801	if (same_inode)
				802	inode_lock(src_inode);
				803	else
				804	lock_two_nondirectories(src_inode, dst_inode);
				805
				806	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
				807	&len, remap_flags);
				808	if (ret < 0 \|\| len == 0)
				809	goto out_unlock;
				810
				811	if (remap_flags & REMAP_FILE_DEDUP)
				812	ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
				813	else
				814	ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
				815
				816	out_unlock:
				817	if (same_inode)
				818	inode_unlock(src_inode);
				819	else
				820	unlock_two_nondirectories(src_inode, dst_inode);
				821
				822	return ret < 0 ? ret : len;
				823	}