Blame - fs/btrfs/reflink.c - SHIFTPHONES/kernel/shift/mainline

blob: 5cd02514cf4d48505c2c2d3a43e5022331dbd8a8 [file] [log] [blame]

Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	3	#include <linux/blkdev.h>
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	4	#include <linux/iversion.h>
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	5	#include "compression.h"
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	6	#include "ctree.h"
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	7	#include "delalloc-space.h"
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	8	#include "reflink.h"
				9	#include "transaction.h"
				10
				11	#define BTRFS_MAX_DEDUPE_LEN SZ_16M
				12
				13	static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				14	struct inode *inode,
				15	u64 endoff,
				16	const u64 destoff,
				17	const u64 olen,
				18	int no_time_update)
				19	{
				20	struct btrfs_root *root = BTRFS_I(inode)->root;
				21	int ret;
				22
				23	inode_inc_iversion(inode);
				24	if (!no_time_update)
				25	inode->i_mtime = inode->i_ctime = current_time(inode);
				26	/*
				27	* We round up to the block size at eof when determining which
				28	* extents to clone above, but shouldn't round up the file size.
				29	*/
				30	if (endoff > destoff + olen)
				31	endoff = destoff + olen;
				32	if (endoff > inode->i_size) {
				33	i_size_write(inode, endoff);
				34	btrfs_inode_safe_disk_i_size_write(inode, 0);
				35	}
				36
				37	ret = btrfs_update_inode(trans, root, inode);
				38	if (ret) {
				39	btrfs_abort_transaction(trans, ret);
				40	btrfs_end_transaction(trans);
				41	goto out;
				42	}
				43	ret = btrfs_end_transaction(trans);
				44	out:
				45	return ret;
				46	}
				47
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	48	static int copy_inline_to_page(struct inode *inode,
				49	const u64 file_offset,
				50	char *inline_data,
				51	const u64 size,
				52	const u64 datal,
				53	const u8 comp_type)
				54	{
				55	const u64 block_size = btrfs_inode_sectorsize(inode);
				56	const u64 range_end = file_offset + block_size - 1;
				57	const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0);
				58	char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0);
				59	struct extent_changeset *data_reserved = NULL;
				60	struct page *page = NULL;
				61	int ret;
				62
				63	ASSERT(IS_ALIGNED(file_offset, block_size));
				64
				65	/*
				66	* We have flushed and locked the ranges of the source and destination
				67	* inodes, we also have locked the inodes, so we are safe to do a
				68	* reservation here. Also we must not do the reservation while holding
				69	* a transaction open, otherwise we would deadlock.
				70	*/
Nikolay Borisov	e5b7231e	2020-06-03 08:55:42 +0300	[diff] [blame]	71	ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
				72	file_offset, block_size);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	73	if (ret)
				74	goto out;
				75
				76	page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT,
				77	btrfs_alloc_write_mask(inode->i_mapping));
				78	if (!page) {
				79	ret = -ENOMEM;
				80	goto out_unlock;
				81	}
				82
				83	set_page_extent_mapped(page);
				84	clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end,
				85	EXTENT_DELALLOC \| EXTENT_DO_ACCOUNTING \| EXTENT_DEFRAG,
				86	0, 0, NULL);
Nikolay Borisov	c2566f2	2020-06-03 08:55:35 +0300	[diff] [blame]	87	ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end,
				88	0, NULL);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	89	if (ret)
				90	goto out_unlock;
				91
				92	if (comp_type == BTRFS_COMPRESS_NONE) {
				93	char *map;
				94
				95	map = kmap(page);
				96	memcpy(map, data_start, datal);
				97	flush_dcache_page(page);
				98	kunmap(page);
				99	} else {
				100	ret = btrfs_decompress(comp_type, data_start, page, 0,
				101	inline_size, datal);
				102	if (ret)
				103	goto out_unlock;
				104	flush_dcache_page(page);
				105	}
				106
				107	/*
				108	* If our inline data is smaller then the block/page size, then the
				109	* remaining of the block/page is equivalent to zeroes. We had something
				110	* like the following done:
				111	*
				112	* $ xfs_io -f -c "pwrite -S 0xab 0 500" file
				113	* $ sync # (or fsync)
				114	* $ xfs_io -c "falloc 0 4K" file
				115	* $ xfs_io -c "pwrite -S 0xcd 4K 4K"
				116	*
				117	* So what's in the range [500, 4095] corresponds to zeroes.
				118	*/
				119	if (datal < block_size) {
				120	char *map;
				121
				122	map = kmap(page);
				123	memset(map + datal, 0, block_size - datal);
				124	flush_dcache_page(page);
				125	kunmap(page);
				126	}
				127
				128	SetPageUptodate(page);
				129	ClearPageChecked(page);
				130	set_page_dirty(page);
				131	out_unlock:
				132	if (page) {
				133	unlock_page(page);
				134	put_page(page);
				135	}
				136	if (ret)
Nikolay Borisov	86d5292	2020-06-03 08:55:40 +0300	[diff] [blame]	137	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved,
				138	file_offset, block_size, true);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	139	btrfs_delalloc_release_extents(BTRFS_I(inode), block_size);
				140	out:
				141	extent_changeset_free(data_reserved);
				142
				143	return ret;
				144	}
				145
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	146	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	147	* Deal with cloning of inline extents. We try to copy the inline extent from
				148	* the source inode to destination inode when possible. When not possible we
				149	* copy the inline extent's data into the respective page of the inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	150	*/
				151	static int clone_copy_inline_extent(struct inode *dst,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	152	struct btrfs_path *path,
				153	struct btrfs_key *new_key,
				154	const u64 drop_start,
				155	const u64 datal,
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	156	const u64 size,
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	157	const u8 comp_type,
				158	char *inline_data,
				159	struct btrfs_trans_handle **trans_out)
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	160	{
				161	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
				162	struct btrfs_root *root = BTRFS_I(dst)->root;
				163	const u64 aligned_end = ALIGN(new_key->offset + datal,
				164	fs_info->sectorsize);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	165	struct btrfs_trans_handle *trans = NULL;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	166	int ret;
				167	struct btrfs_key key;
				168
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	169	if (new_key->offset > 0) {
				170	ret = copy_inline_to_page(dst, new_key->offset, inline_data,
				171	size, datal, comp_type);
				172	goto out;
				173	}
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	174
				175	key.objectid = btrfs_ino(BTRFS_I(dst));
				176	key.type = BTRFS_EXTENT_DATA_KEY;
				177	key.offset = 0;
				178	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				179	if (ret < 0) {
				180	return ret;
				181	} else if (ret > 0) {
				182	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
				183	ret = btrfs_next_leaf(root, path);
				184	if (ret < 0)
				185	return ret;
				186	else if (ret > 0)
				187	goto copy_inline_extent;
				188	}
				189	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				190	if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
				191	key.type == BTRFS_EXTENT_DATA_KEY) {
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	192	/*
				193	* There's an implicit hole at file offset 0, copy the
				194	* inline extent's data to the page.
				195	*/
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	196	ASSERT(key.offset > 0);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	197	ret = copy_inline_to_page(dst, new_key->offset,
				198	inline_data, size, datal,
				199	comp_type);
				200	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	201	}
				202	} else if (i_size_read(dst) <= datal) {
				203	struct btrfs_file_extent_item *ei;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	204
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	205	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				206	struct btrfs_file_extent_item);
				207	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	208	* If it's an inline extent replace it with the source inline
				209	* extent, otherwise copy the source inline extent data into
				210	* the respective page at the destination inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	211	*/
				212	if (btrfs_file_extent_type(path->nodes[0], ei) ==
				213	BTRFS_FILE_EXTENT_INLINE)
				214	goto copy_inline_extent;
				215
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	216	ret = copy_inline_to_page(dst, new_key->offset, inline_data,
				217	size, datal, comp_type);
				218	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	219	}
				220
				221	copy_inline_extent:
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	222	ret = 0;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	223	/*
				224	* We have no extent items, or we have an extent at offset 0 which may
				225	* or may not be inlined. All these cases are dealt the same way.
				226	*/
				227	if (i_size_read(dst) > datal) {
				228	/*
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	229	* At the destination offset 0 we have either a hole, a regular
				230	* extent or an inline extent larger then the one we want to
				231	* clone. Deal with all these cases by copying the inline extent
				232	* data into the respective page at the destination inode.
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	233	*/
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	234	ret = copy_inline_to_page(dst, new_key->offset, inline_data,
				235	size, datal, comp_type);
				236	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	237	}
				238
				239	btrfs_release_path(path);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	240	/*
				241	* If we end up here it means were copy the inline extent into a leaf
				242	* of the destination inode. We know we will drop or adjust at most one
				243	* extent item in the destination root.
				244	*
				245	* 1 unit - adjusting old extent (we may have to split it)
				246	* 1 unit - add new extent
				247	* 1 unit - inode update
				248	*/
				249	trans = btrfs_start_transaction(root, 3);
				250	if (IS_ERR(trans)) {
				251	ret = PTR_ERR(trans);
				252	trans = NULL;
				253	goto out;
				254	}
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	255	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
				256	if (ret)
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	257	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	258	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
				259	if (ret)
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	260	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	261
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	262	write_extent_buffer(path->nodes[0], inline_data,
				263	btrfs_item_ptr_offset(path->nodes[0],
				264	path->slots[0]),
				265	size);
				266	inode_add_bytes(dst, datal);
				267	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
Filipe Manana	4fdb688	2020-04-04 21:20:22 +0100	[diff] [blame]	268	ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	269	out:
				270	if (!ret && !trans) {
				271	/*
				272	* No transaction here means we copied the inline extent into a
				273	* page of the destination inode.
				274	*
				275	* 1 unit to update inode item
				276	*/
				277	trans = btrfs_start_transaction(root, 1);
				278	if (IS_ERR(trans)) {
				279	ret = PTR_ERR(trans);
				280	trans = NULL;
				281	}
				282	}
				283	if (ret && trans) {
				284	btrfs_abort_transaction(trans, ret);
				285	btrfs_end_transaction(trans);
				286	}
				287	if (!ret)
				288	*trans_out = trans;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	289
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	290	return ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	291	}
				292
				293	/**
				294	* btrfs_clone() - clone a range from inode file to another
				295	*
				296	* @src: Inode to clone from
				297	* @inode: Inode to clone to
				298	* @off: Offset within source to start clone from
				299	* @olen: Original length, passed by user, of range to clone
				300	* @olen_aligned: Block-aligned value of olen
				301	* @destoff: Offset within @inode to start clone
				302	* @no_time_update: Whether to update mtime/ctime on the target inode
				303	*/
				304	static int btrfs_clone(struct inode src, struct inode inode,
				305	const u64 off, const u64 olen, const u64 olen_aligned,
				306	const u64 destoff, int no_time_update)
				307	{
				308	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	309	struct btrfs_path *path = NULL;
				310	struct extent_buffer *leaf;
				311	struct btrfs_trans_handle *trans;
				312	char *buf = NULL;
				313	struct btrfs_key key;
				314	u32 nritems;
				315	int slot;
				316	int ret;
				317	const u64 len = olen_aligned;
				318	u64 last_dest_end = destoff;
				319
				320	ret = -ENOMEM;
				321	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
				322	if (!buf)
				323	return ret;
				324
				325	path = btrfs_alloc_path();
				326	if (!path) {
				327	kvfree(buf);
				328	return ret;
				329	}
				330
				331	path->reada = READA_FORWARD;
				332	/* Clone data */
				333	key.objectid = btrfs_ino(BTRFS_I(src));
				334	key.type = BTRFS_EXTENT_DATA_KEY;
				335	key.offset = off;
				336
				337	while (1) {
				338	u64 next_key_min_offset = key.offset + 1;
				339	struct btrfs_file_extent_item *extent;
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	340	u64 extent_gen;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	341	int type;
				342	u32 size;
				343	struct btrfs_key new_key;
				344	u64 disko = 0, diskl = 0;
				345	u64 datao = 0, datal = 0;
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	346	u8 comp;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	347	u64 drop_start;
				348
				349	/* Note the key will change type as we walk through the tree */
				350	path->leave_spinning = 1;
				351	ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				352	0, 0);
				353	if (ret < 0)
				354	goto out;
				355	/*
				356	* First search, if no extent item that starts at offset off was
				357	* found but the previous item is an extent item, it's possible
				358	* it might overlap our target range, therefore process it.
				359	*/
				360	if (key.offset == off && ret > 0 && path->slots[0] > 0) {
				361	btrfs_item_key_to_cpu(path->nodes[0], &key,
				362	path->slots[0] - 1);
				363	if (key.type == BTRFS_EXTENT_DATA_KEY)
				364	path->slots[0]--;
				365	}
				366
				367	nritems = btrfs_header_nritems(path->nodes[0]);
				368	process_slot:
				369	if (path->slots[0] >= nritems) {
				370	ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
				371	if (ret < 0)
				372	goto out;
				373	if (ret > 0)
				374	break;
				375	nritems = btrfs_header_nritems(path->nodes[0]);
				376	}
				377	leaf = path->nodes[0];
				378	slot = path->slots[0];
				379
				380	btrfs_item_key_to_cpu(leaf, &key, slot);
				381	if (key.type > BTRFS_EXTENT_DATA_KEY \|\|
				382	key.objectid != btrfs_ino(BTRFS_I(src)))
				383	break;
				384
				385	ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
				386
				387	extent = btrfs_item_ptr(leaf, slot,
				388	struct btrfs_file_extent_item);
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	389	extent_gen = btrfs_file_extent_generation(leaf, extent);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	390	comp = btrfs_file_extent_compression(leaf, extent);
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	391	type = btrfs_file_extent_type(leaf, extent);
				392	if (type == BTRFS_FILE_EXTENT_REG \|\|
				393	type == BTRFS_FILE_EXTENT_PREALLOC) {
				394	disko = btrfs_file_extent_disk_bytenr(leaf, extent);
				395	diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
				396	datao = btrfs_file_extent_offset(leaf, extent);
				397	datal = btrfs_file_extent_num_bytes(leaf, extent);
				398	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
				399	/* Take upper bound, may be compressed */
				400	datal = btrfs_file_extent_ram_bytes(leaf, extent);
				401	}
				402
				403	/*
				404	* The first search might have left us at an extent item that
				405	* ends before our target range's start, can happen if we have
				406	* holes and NO_HOLES feature enabled.
				407	*/
				408	if (key.offset + datal <= off) {
				409	path->slots[0]++;
				410	goto process_slot;
				411	} else if (key.offset >= off + len) {
				412	break;
				413	}
				414	next_key_min_offset = key.offset + datal;
				415	size = btrfs_item_size_nr(leaf, slot);
				416	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				417	size);
				418
				419	btrfs_release_path(path);
				420	path->leave_spinning = 0;
				421
				422	memcpy(&new_key, &key, sizeof(new_key));
				423	new_key.objectid = btrfs_ino(BTRFS_I(inode));
				424	if (off <= key.offset)
				425	new_key.offset = key.offset + destoff - off;
				426	else
				427	new_key.offset = destoff;
				428
				429	/*
				430	* Deal with a hole that doesn't have an extent item that
				431	* represents it (NO_HOLES feature enabled).
				432	* This hole is either in the middle of the cloning range or at
				433	* the beginning (fully overlaps it or partially overlaps it).
				434	*/
				435	if (new_key.offset != last_dest_end)
				436	drop_start = last_dest_end;
				437	else
				438	drop_start = new_key.offset;
				439
				440	if (type == BTRFS_FILE_EXTENT_REG \|\|
				441	type == BTRFS_FILE_EXTENT_PREALLOC) {
				442	struct btrfs_clone_extent_info clone_info;
				443
				444	/*
				445	* a \| --- range to clone ---\| b
				446	* \| ------------- extent ------------- \|
				447	*/
				448
				449	/* Subtract range b */
				450	if (key.offset + datal > off + len)
				451	datal = off + len - key.offset;
				452
				453	/* Subtract range a */
				454	if (off > key.offset) {
				455	datao += off - key.offset;
				456	datal -= off - key.offset;
				457	}
				458
				459	clone_info.disk_offset = disko;
				460	clone_info.disk_len = diskl;
				461	clone_info.data_offset = datao;
				462	clone_info.data_len = datal;
				463	clone_info.file_offset = new_key.offset;
				464	clone_info.extent_buf = buf;
				465	clone_info.item_size = size;
				466	ret = btrfs_punch_hole_range(inode, path, drop_start,
				467	new_key.offset + datal - 1, &clone_info,
				468	&trans);
				469	if (ret)
				470	goto out;
				471	} else if (type == BTRFS_FILE_EXTENT_INLINE) {
Filipe Manana	a61e1e0	2020-02-28 13:04:18 +0000	[diff] [blame]	472	/*
				473	* Inline extents always have to start at file offset 0
				474	* and can never be bigger then the sector size. We can
				475	* never clone only parts of an inline extent, since all
				476	* reflink operations must start at a sector size aligned
				477	* offset, and the length must be aligned too or end at
				478	* the i_size (which implies the whole inlined data).
				479	*/
				480	ASSERT(key.offset == 0);
				481	ASSERT(datal <= fs_info->sectorsize);
				482	if (key.offset != 0 \|\| datal > fs_info->sectorsize)
				483	return -EUCLEAN;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	484
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	485	ret = clone_copy_inline_extent(inode, path, &new_key,
				486	drop_start, datal, size,
				487	comp, buf, &trans);
				488	if (ret)
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	489	goto out;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	490	}
				491
				492	btrfs_release_path(path);
				493
Filipe Manana	3ebac17	2020-07-15 12:30:43 +0100	[diff] [blame]	494	/*
				495	* If this is a new extent update the last_reflink_trans of both
				496	* inodes. This is used by fsync to make sure it does not log
				497	* multiple checksum items with overlapping ranges. For older
				498	* extents we don't need to do it since inode logging skips the
				499	* checksums for older extents. Also ignore holes and inline
				500	* extents because they don't have checksums in the csum tree.
				501	*/
				502	if (extent_gen == trans->transid && disko > 0) {
				503	BTRFS_I(src)->last_reflink_trans = trans->transid;
				504	BTRFS_I(inode)->last_reflink_trans = trans->transid;
				505	}
				506
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	507	last_dest_end = ALIGN(new_key.offset + datal,
				508	fs_info->sectorsize);
				509	ret = clone_finish_inode_update(trans, inode, last_dest_end,
				510	destoff, olen, no_time_update);
				511	if (ret)
				512	goto out;
				513	if (new_key.offset + datal >= destoff + len)
				514	break;
				515
				516	btrfs_release_path(path);
				517	key.offset = next_key_min_offset;
				518
				519	if (fatal_signal_pending(current)) {
				520	ret = -EINTR;
				521	goto out;
				522	}
				523	}
				524	ret = 0;
				525
				526	if (last_dest_end < destoff + len) {
				527	/*
				528	* We have an implicit hole that fully or partially overlaps our
				529	* cloning range at its end. This means that we either have the
				530	* NO_HOLES feature enabled or the implicit hole happened due to
				531	* mixing buffered and direct IO writes against this file.
				532	*/
				533	btrfs_release_path(path);
				534	path->leave_spinning = 0;
				535
				536	ret = btrfs_punch_hole_range(inode, path, last_dest_end,
				537	destoff + len - 1, NULL, &trans);
				538	if (ret)
				539	goto out;
				540
				541	ret = clone_finish_inode_update(trans, inode, destoff + len,
				542	destoff, olen, no_time_update);
				543	}
				544
				545	out:
				546	btrfs_free_path(path);
				547	kvfree(buf);
				548	return ret;
				549	}
				550
				551	static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
				552	struct inode *inode2, u64 loff2, u64 len)
				553	{
				554	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				555	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				556	}
				557
				558	static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
				559	struct inode *inode2, u64 loff2, u64 len)
				560	{
				561	if (inode1 < inode2) {
				562	swap(inode1, inode2);
				563	swap(loff1, loff2);
				564	} else if (inode1 == inode2 && loff2 < loff1) {
				565	swap(loff1, loff2);
				566	}
				567	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
				568	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
				569	}
				570
				571	static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
				572	struct inode *dst, u64 dst_loff)
				573	{
				574	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
				575	int ret;
				576
				577	/*
				578	* Lock destination range to serialize with concurrent readpages() and
				579	* source range to serialize with relocation.
				580	*/
				581	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
				582	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
				583	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
				584
				585	return ret;
				586	}
				587
				588	static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
				589	struct inode *dst, u64 dst_loff)
				590	{
				591	int ret;
				592	u64 i, tail_len, chunk_count;
				593	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
				594
				595	spin_lock(&root_dst->root_item_lock);
				596	if (root_dst->send_in_progress) {
				597	btrfs_warn_rl(root_dst->fs_info,
				598	"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
				599	root_dst->root_key.objectid,
				600	root_dst->send_in_progress);
				601	spin_unlock(&root_dst->root_item_lock);
				602	return -EAGAIN;
				603	}
				604	root_dst->dedupe_in_progress++;
				605	spin_unlock(&root_dst->root_item_lock);
				606
				607	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
				608	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
				609
				610	for (i = 0; i < chunk_count; i++) {
				611	ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
				612	dst, dst_loff);
				613	if (ret)
				614	goto out;
				615
				616	loff += BTRFS_MAX_DEDUPE_LEN;
				617	dst_loff += BTRFS_MAX_DEDUPE_LEN;
				618	}
				619
				620	if (tail_len > 0)
				621	ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
				622	out:
				623	spin_lock(&root_dst->root_item_lock);
				624	root_dst->dedupe_in_progress--;
				625	spin_unlock(&root_dst->root_item_lock);
				626
				627	return ret;
				628	}
				629
				630	static noinline int btrfs_clone_files(struct file file, struct file file_src,
				631	u64 off, u64 olen, u64 destoff)
				632	{
				633	struct inode *inode = file_inode(file);
				634	struct inode *src = file_inode(file_src);
				635	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
				636	int ret;
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	637	int wb_ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	638	u64 len = olen;
				639	u64 bs = fs_info->sb->s_blocksize;
				640
				641	/*
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	642	* VFS's generic_remap_file_range_prep() protects us from cloning the
				643	* eof block into the middle of a file, which would result in corruption
				644	* if the file size is not blocksize aligned. So we don't need to check
				645	* for that case here.
				646	*/
				647	if (off + len == src->i_size)
				648	len = ALIGN(src->i_size, bs) - off;
				649
				650	if (destoff > inode->i_size) {
				651	const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
				652
				653	ret = btrfs_cont_expand(inode, inode->i_size, destoff);
				654	if (ret)
				655	return ret;
				656	/*
				657	* We may have truncated the last block if the inode's size is
				658	* not sector size aligned, so we need to wait for writeback to
				659	* complete before proceeding further, otherwise we can race
				660	* with cloning and attempt to increment a reference to an
				661	* extent that no longer exists (writeback completed right after
				662	* we found the previous extent covering eof and before we
				663	* attempted to increment its reference count).
				664	*/
				665	ret = btrfs_wait_ordered_range(inode, wb_start,
				666	destoff - wb_start);
				667	if (ret)
				668	return ret;
				669	}
				670
				671	/*
				672	* Lock destination range to serialize with concurrent readpages() and
				673	* source range to serialize with relocation.
				674	*/
				675	btrfs_double_extent_lock(src, off, inode, destoff, len);
				676	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
				677	btrfs_double_extent_unlock(src, off, inode, destoff, len);
Filipe Manana	05a5a76	2020-02-28 13:04:19 +0000	[diff] [blame]	678
				679	/*
				680	* We may have copied an inline extent into a page of the destination
				681	* range, so wait for writeback to complete before truncating pages
				682	* from the page cache. This is a rare case.
				683	*/
				684	wb_ret = btrfs_wait_ordered_range(inode, destoff, len);
				685	ret = ret ? ret : wb_ret;
Filipe Manana	6a17738	2020-02-28 13:04:17 +0000	[diff] [blame]	686	/*
				687	* Truncate page cache pages so that future reads will see the cloned
				688	* data immediately and not the previous data.
				689	*/
				690	truncate_inode_pages_range(&inode->i_data,
				691	round_down(destoff, PAGE_SIZE),
				692	round_up(destoff + len, PAGE_SIZE) - 1);
				693
				694	return ret;
				695	}
				696
				697	static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				698	struct file *file_out, loff_t pos_out,
				699	loff_t *len, unsigned int remap_flags)
				700	{
				701	struct inode *inode_in = file_inode(file_in);
				702	struct inode *inode_out = file_inode(file_out);
				703	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
				704	bool same_inode = inode_out == inode_in;
				705	u64 wb_len;
				706	int ret;
				707
				708	if (!(remap_flags & REMAP_FILE_DEDUP)) {
				709	struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
				710
				711	if (btrfs_root_readonly(root_out))
				712	return -EROFS;
				713
				714	if (file_in->f_path.mnt != file_out->f_path.mnt \|\|
				715	inode_in->i_sb != inode_out->i_sb)
				716	return -EXDEV;
				717	}
				718
				719	/* Don't make the dst file partly checksummed */
				720	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
				721	(BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
				722	return -EINVAL;
				723	}
				724
				725	/*
				726	* Now that the inodes are locked, we need to start writeback ourselves
				727	* and can not rely on the writeback from the VFS's generic helper
				728	* generic_remap_file_range_prep() because:
				729	*
				730	* 1) For compression we must call filemap_fdatawrite_range() range
				731	* twice (btrfs_fdatawrite_range() does it for us), and the generic
				732	* helper only calls it once;
				733	*
				734	* 2) filemap_fdatawrite_range(), called by the generic helper only
				735	* waits for the writeback to complete, i.e. for IO to be done, and
				736	* not for the ordered extents to complete. We need to wait for them
				737	* to complete so that new file extent items are in the fs tree.
				738	*/
				739	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
				740	wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
				741	else
				742	wb_len = ALIGN(*len, bs);
				743
				744	/*
				745	* Since we don't lock ranges, wait for ongoing lockless dio writes (as
				746	* any in progress could create its ordered extents after we wait for
				747	* existing ordered extents below).
				748	*/
				749	inode_dio_wait(inode_in);
				750	if (!same_inode)
				751	inode_dio_wait(inode_out);
				752
				753	/*
				754	* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
				755	*
				756	* Btrfs' back references do not have a block level granularity, they
				757	* work at the whole extent level.
				758	* NOCOW buffered write without data space reserved may not be able
				759	* to fall back to CoW due to lack of data space, thus could cause
				760	* data loss.
				761	*
				762	* Here we take a shortcut by flushing the whole inode, so that all
				763	* nocow write should reach disk as nocow before we increase the
				764	* reference of the extent. We could do better by only flushing NOCOW
				765	* data, but that needs extra accounting.
				766	*
				767	* Also we don't need to check ASYNC_EXTENT, as async extent will be
				768	* CoWed anyway, not affecting nocow part.
				769	*/
				770	ret = filemap_flush(inode_in->i_mapping);
				771	if (ret < 0)
				772	return ret;
				773
				774	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				775	wb_len);
				776	if (ret < 0)
				777	return ret;
				778	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				779	wb_len);
				780	if (ret < 0)
				781	return ret;
				782
				783	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
				784	len, remap_flags);
				785	}
				786
				787	loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
				788	struct file *dst_file, loff_t destoff, loff_t len,
				789	unsigned int remap_flags)
				790	{
				791	struct inode *src_inode = file_inode(src_file);
				792	struct inode *dst_inode = file_inode(dst_file);
				793	bool same_inode = dst_inode == src_inode;
				794	int ret;
				795
				796	if (remap_flags & ~(REMAP_FILE_DEDUP \| REMAP_FILE_ADVISORY))
				797	return -EINVAL;
				798
				799	if (same_inode)
				800	inode_lock(src_inode);
				801	else
				802	lock_two_nondirectories(src_inode, dst_inode);
				803
				804	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
				805	&len, remap_flags);
				806	if (ret < 0 \|\| len == 0)
				807	goto out_unlock;
				808
				809	if (remap_flags & REMAP_FILE_DEDUP)
				810	ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
				811	else
				812	ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
				813
				814	out_unlock:
				815	if (same_inode)
				816	inode_unlock(src_inode);
				817	else
				818	unlock_two_nondirectories(src_inode, dst_inode);
				819
				820	return ret < 0 ? ret : len;
				821	}