// SPDX-License-Identifier: GPL-2.0
2
3#include <linux/iversion.h>
4#include "ctree.h"
5#include "reflink.h"
6#include "transaction.h"
7
/*
 * Largest range passed to a single dedupe clone call. btrfs_extent_same()
 * splits longer requests into chunks of this size plus a shorter tail.
 */
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
9
10static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
11 struct inode *inode,
12 u64 endoff,
13 const u64 destoff,
14 const u64 olen,
15 int no_time_update)
16{
17 struct btrfs_root *root = BTRFS_I(inode)->root;
18 int ret;
19
20 inode_inc_iversion(inode);
21 if (!no_time_update)
22 inode->i_mtime = inode->i_ctime = current_time(inode);
23 /*
24 * We round up to the block size at eof when determining which
25 * extents to clone above, but shouldn't round up the file size.
26 */
27 if (endoff > destoff + olen)
28 endoff = destoff + olen;
29 if (endoff > inode->i_size) {
30 i_size_write(inode, endoff);
31 btrfs_inode_safe_disk_i_size_write(inode, 0);
32 }
33
34 ret = btrfs_update_inode(trans, root, inode);
35 if (ret) {
36 btrfs_abort_transaction(trans, ret);
37 btrfs_end_transaction(trans);
38 goto out;
39 }
40 ret = btrfs_end_transaction(trans);
41out:
42 return ret;
43}
44
45/*
46 * Make sure we do not end up inserting an inline extent into a file that has
47 * already other (non-inline) extents. If a file has an inline extent it can
48 * not have any other extents and the (single) inline extent must start at the
49 * file offset 0. Failing to respect these rules will lead to file corruption,
50 * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
51 *
52 * We can have extents that have been already written to disk or we can have
53 * dirty ranges still in delalloc, in which case the extent maps and items are
54 * created only when we run delalloc, and the delalloc ranges might fall outside
55 * the range we are currently locking in the inode's io tree. So we check the
56 * inode's i_size because of that (i_size updates are done while holding the
57 * i_mutex, which we are holding here).
58 * We also check to see if the inode has a size not greater than "datal" but has
59 * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
60 * protected against such concurrent fallocate calls by the i_mutex).
61 *
62 * If the file has no extents but a size greater than datal, do not allow the
63 * copy because we would need turn the inline extent into a non-inline one (even
64 * with NO_HOLES enabled). If we find our destination inode only has one inline
65 * extent, just overwrite it with the source inline extent if its size is less
66 * than the source extent's size, or we could copy the source inline extent's
67 * data into the destination inode's inline extent if the later is greater then
68 * the former.
69 */
70static int clone_copy_inline_extent(struct inode *dst,
71 struct btrfs_trans_handle *trans,
72 struct btrfs_path *path,
73 struct btrfs_key *new_key,
74 const u64 drop_start,
75 const u64 datal,
Filipe Manana6a177382020-02-28 13:04:17 +000076 const u64 size,
Filipe Mananaa61e1e02020-02-28 13:04:18 +000077 const char *inline_data)
Filipe Manana6a177382020-02-28 13:04:17 +000078{
79 struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
80 struct btrfs_root *root = BTRFS_I(dst)->root;
81 const u64 aligned_end = ALIGN(new_key->offset + datal,
82 fs_info->sectorsize);
83 int ret;
84 struct btrfs_key key;
85
86 if (new_key->offset > 0)
87 return -EOPNOTSUPP;
88
89 key.objectid = btrfs_ino(BTRFS_I(dst));
90 key.type = BTRFS_EXTENT_DATA_KEY;
91 key.offset = 0;
92 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
93 if (ret < 0) {
94 return ret;
95 } else if (ret > 0) {
96 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
97 ret = btrfs_next_leaf(root, path);
98 if (ret < 0)
99 return ret;
100 else if (ret > 0)
101 goto copy_inline_extent;
102 }
103 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
104 if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
105 key.type == BTRFS_EXTENT_DATA_KEY) {
106 ASSERT(key.offset > 0);
107 return -EOPNOTSUPP;
108 }
109 } else if (i_size_read(dst) <= datal) {
110 struct btrfs_file_extent_item *ei;
111 u64 ext_len;
112
113 /*
114 * If the file size is <= datal, make sure there are no other
115 * extents following (can happen do to an fallocate call with
116 * the flag FALLOC_FL_KEEP_SIZE).
117 */
118 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
119 struct btrfs_file_extent_item);
120 /*
121 * If it's an inline extent, it can not have other extents
122 * following it.
123 */
124 if (btrfs_file_extent_type(path->nodes[0], ei) ==
125 BTRFS_FILE_EXTENT_INLINE)
126 goto copy_inline_extent;
127
128 ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
129 if (ext_len > aligned_end)
130 return -EOPNOTSUPP;
131
132 ret = btrfs_next_item(root, path);
133 if (ret < 0) {
134 return ret;
135 } else if (ret == 0) {
136 btrfs_item_key_to_cpu(path->nodes[0], &key,
137 path->slots[0]);
138 if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
139 key.type == BTRFS_EXTENT_DATA_KEY)
140 return -EOPNOTSUPP;
141 }
142 }
143
144copy_inline_extent:
145 /*
146 * We have no extent items, or we have an extent at offset 0 which may
147 * or may not be inlined. All these cases are dealt the same way.
148 */
149 if (i_size_read(dst) > datal) {
150 /*
151 * If the destination inode has an inline extent.
152 * This would require copying the data from the source inline
153 * extent into the beginning of the destination's inline extent.
154 * But this is really complex, both extents can be compressed
155 * or just one of them, which would require decompressing and
156 * re-compressing data (which could increase the new compressed
157 * size, not allowing the compressed data to fit anymore in an
158 * inline extent).
159 * So just don't support this case for now (it should be rare,
160 * we are not really saving space when cloning inline extents).
161 */
162 return -EOPNOTSUPP;
163 }
164
165 btrfs_release_path(path);
166 ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
167 if (ret)
168 return ret;
169 ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
170 if (ret)
171 return ret;
172
Filipe Manana6a177382020-02-28 13:04:17 +0000173 write_extent_buffer(path->nodes[0], inline_data,
174 btrfs_item_ptr_offset(path->nodes[0],
175 path->slots[0]),
176 size);
177 inode_add_bytes(dst, datal);
178 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
179
180 return 0;
181}
182
183/**
184 * btrfs_clone() - clone a range from inode file to another
185 *
186 * @src: Inode to clone from
187 * @inode: Inode to clone to
188 * @off: Offset within source to start clone from
189 * @olen: Original length, passed by user, of range to clone
190 * @olen_aligned: Block-aligned value of olen
191 * @destoff: Offset within @inode to start clone
192 * @no_time_update: Whether to update mtime/ctime on the target inode
193 */
194static int btrfs_clone(struct inode *src, struct inode *inode,
195 const u64 off, const u64 olen, const u64 olen_aligned,
196 const u64 destoff, int no_time_update)
197{
198 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
199 struct btrfs_root *root = BTRFS_I(inode)->root;
200 struct btrfs_path *path = NULL;
201 struct extent_buffer *leaf;
202 struct btrfs_trans_handle *trans;
203 char *buf = NULL;
204 struct btrfs_key key;
205 u32 nritems;
206 int slot;
207 int ret;
208 const u64 len = olen_aligned;
209 u64 last_dest_end = destoff;
210
211 ret = -ENOMEM;
212 buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
213 if (!buf)
214 return ret;
215
216 path = btrfs_alloc_path();
217 if (!path) {
218 kvfree(buf);
219 return ret;
220 }
221
222 path->reada = READA_FORWARD;
223 /* Clone data */
224 key.objectid = btrfs_ino(BTRFS_I(src));
225 key.type = BTRFS_EXTENT_DATA_KEY;
226 key.offset = off;
227
228 while (1) {
229 u64 next_key_min_offset = key.offset + 1;
230 struct btrfs_file_extent_item *extent;
231 int type;
232 u32 size;
233 struct btrfs_key new_key;
234 u64 disko = 0, diskl = 0;
235 u64 datao = 0, datal = 0;
Filipe Manana6a177382020-02-28 13:04:17 +0000236 u64 drop_start;
237
238 /* Note the key will change type as we walk through the tree */
239 path->leave_spinning = 1;
240 ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
241 0, 0);
242 if (ret < 0)
243 goto out;
244 /*
245 * First search, if no extent item that starts at offset off was
246 * found but the previous item is an extent item, it's possible
247 * it might overlap our target range, therefore process it.
248 */
249 if (key.offset == off && ret > 0 && path->slots[0] > 0) {
250 btrfs_item_key_to_cpu(path->nodes[0], &key,
251 path->slots[0] - 1);
252 if (key.type == BTRFS_EXTENT_DATA_KEY)
253 path->slots[0]--;
254 }
255
256 nritems = btrfs_header_nritems(path->nodes[0]);
257process_slot:
258 if (path->slots[0] >= nritems) {
259 ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
260 if (ret < 0)
261 goto out;
262 if (ret > 0)
263 break;
264 nritems = btrfs_header_nritems(path->nodes[0]);
265 }
266 leaf = path->nodes[0];
267 slot = path->slots[0];
268
269 btrfs_item_key_to_cpu(leaf, &key, slot);
270 if (key.type > BTRFS_EXTENT_DATA_KEY ||
271 key.objectid != btrfs_ino(BTRFS_I(src)))
272 break;
273
274 ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
275
276 extent = btrfs_item_ptr(leaf, slot,
277 struct btrfs_file_extent_item);
Filipe Manana6a177382020-02-28 13:04:17 +0000278 type = btrfs_file_extent_type(leaf, extent);
279 if (type == BTRFS_FILE_EXTENT_REG ||
280 type == BTRFS_FILE_EXTENT_PREALLOC) {
281 disko = btrfs_file_extent_disk_bytenr(leaf, extent);
282 diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
283 datao = btrfs_file_extent_offset(leaf, extent);
284 datal = btrfs_file_extent_num_bytes(leaf, extent);
285 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
286 /* Take upper bound, may be compressed */
287 datal = btrfs_file_extent_ram_bytes(leaf, extent);
288 }
289
290 /*
291 * The first search might have left us at an extent item that
292 * ends before our target range's start, can happen if we have
293 * holes and NO_HOLES feature enabled.
294 */
295 if (key.offset + datal <= off) {
296 path->slots[0]++;
297 goto process_slot;
298 } else if (key.offset >= off + len) {
299 break;
300 }
301 next_key_min_offset = key.offset + datal;
302 size = btrfs_item_size_nr(leaf, slot);
303 read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
304 size);
305
306 btrfs_release_path(path);
307 path->leave_spinning = 0;
308
309 memcpy(&new_key, &key, sizeof(new_key));
310 new_key.objectid = btrfs_ino(BTRFS_I(inode));
311 if (off <= key.offset)
312 new_key.offset = key.offset + destoff - off;
313 else
314 new_key.offset = destoff;
315
316 /*
317 * Deal with a hole that doesn't have an extent item that
318 * represents it (NO_HOLES feature enabled).
319 * This hole is either in the middle of the cloning range or at
320 * the beginning (fully overlaps it or partially overlaps it).
321 */
322 if (new_key.offset != last_dest_end)
323 drop_start = last_dest_end;
324 else
325 drop_start = new_key.offset;
326
327 if (type == BTRFS_FILE_EXTENT_REG ||
328 type == BTRFS_FILE_EXTENT_PREALLOC) {
329 struct btrfs_clone_extent_info clone_info;
330
331 /*
332 * a | --- range to clone ---| b
333 * | ------------- extent ------------- |
334 */
335
336 /* Subtract range b */
337 if (key.offset + datal > off + len)
338 datal = off + len - key.offset;
339
340 /* Subtract range a */
341 if (off > key.offset) {
342 datao += off - key.offset;
343 datal -= off - key.offset;
344 }
345
346 clone_info.disk_offset = disko;
347 clone_info.disk_len = diskl;
348 clone_info.data_offset = datao;
349 clone_info.data_len = datal;
350 clone_info.file_offset = new_key.offset;
351 clone_info.extent_buf = buf;
352 clone_info.item_size = size;
353 ret = btrfs_punch_hole_range(inode, path, drop_start,
354 new_key.offset + datal - 1, &clone_info,
355 &trans);
356 if (ret)
357 goto out;
358 } else if (type == BTRFS_FILE_EXTENT_INLINE) {
Filipe Mananaa61e1e02020-02-28 13:04:18 +0000359 /*
360 * Inline extents always have to start at file offset 0
361 * and can never be bigger then the sector size. We can
362 * never clone only parts of an inline extent, since all
363 * reflink operations must start at a sector size aligned
364 * offset, and the length must be aligned too or end at
365 * the i_size (which implies the whole inlined data).
366 */
367 ASSERT(key.offset == 0);
368 ASSERT(datal <= fs_info->sectorsize);
369 if (key.offset != 0 || datal > fs_info->sectorsize)
370 return -EUCLEAN;
Filipe Manana6a177382020-02-28 13:04:17 +0000371
372 /*
373 * If our extent is inline, we know we will drop or
374 * adjust at most 1 extent item in the destination root.
375 *
376 * 1 - adjusting old extent (we may have to split it)
377 * 1 - add new extent
378 * 1 - inode update
379 */
380 trans = btrfs_start_transaction(root, 3);
381 if (IS_ERR(trans)) {
382 ret = PTR_ERR(trans);
383 goto out;
384 }
385
386 ret = clone_copy_inline_extent(inode, trans, path,
387 &new_key, drop_start,
Filipe Mananaa61e1e02020-02-28 13:04:18 +0000388 datal, size, buf);
Filipe Manana6a177382020-02-28 13:04:17 +0000389 if (ret) {
390 if (ret != -EOPNOTSUPP)
391 btrfs_abort_transaction(trans, ret);
392 btrfs_end_transaction(trans);
393 goto out;
394 }
395 }
396
397 btrfs_release_path(path);
398
399 last_dest_end = ALIGN(new_key.offset + datal,
400 fs_info->sectorsize);
401 ret = clone_finish_inode_update(trans, inode, last_dest_end,
402 destoff, olen, no_time_update);
403 if (ret)
404 goto out;
405 if (new_key.offset + datal >= destoff + len)
406 break;
407
408 btrfs_release_path(path);
409 key.offset = next_key_min_offset;
410
411 if (fatal_signal_pending(current)) {
412 ret = -EINTR;
413 goto out;
414 }
415 }
416 ret = 0;
417
418 if (last_dest_end < destoff + len) {
419 /*
420 * We have an implicit hole that fully or partially overlaps our
421 * cloning range at its end. This means that we either have the
422 * NO_HOLES feature enabled or the implicit hole happened due to
423 * mixing buffered and direct IO writes against this file.
424 */
425 btrfs_release_path(path);
426 path->leave_spinning = 0;
427
428 ret = btrfs_punch_hole_range(inode, path, last_dest_end,
429 destoff + len - 1, NULL, &trans);
430 if (ret)
431 goto out;
432
433 ret = clone_finish_inode_update(trans, inode, destoff + len,
434 destoff, olen, no_time_update);
435 }
436
437out:
438 btrfs_free_path(path);
439 kvfree(buf);
440 return ret;
441}
442
443static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
444 struct inode *inode2, u64 loff2, u64 len)
445{
446 unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
447 unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
448}
449
450static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
451 struct inode *inode2, u64 loff2, u64 len)
452{
453 if (inode1 < inode2) {
454 swap(inode1, inode2);
455 swap(loff1, loff2);
456 } else if (inode1 == inode2 && loff2 < loff1) {
457 swap(loff1, loff2);
458 }
459 lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
460 lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
461}
462
463static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
464 struct inode *dst, u64 dst_loff)
465{
466 const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
467 int ret;
468
469 /*
470 * Lock destination range to serialize with concurrent readpages() and
471 * source range to serialize with relocation.
472 */
473 btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
474 ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
475 btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
476
477 return ret;
478}
479
480static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
481 struct inode *dst, u64 dst_loff)
482{
483 int ret;
484 u64 i, tail_len, chunk_count;
485 struct btrfs_root *root_dst = BTRFS_I(dst)->root;
486
487 spin_lock(&root_dst->root_item_lock);
488 if (root_dst->send_in_progress) {
489 btrfs_warn_rl(root_dst->fs_info,
490"cannot deduplicate to root %llu while send operations are using it (%d in progress)",
491 root_dst->root_key.objectid,
492 root_dst->send_in_progress);
493 spin_unlock(&root_dst->root_item_lock);
494 return -EAGAIN;
495 }
496 root_dst->dedupe_in_progress++;
497 spin_unlock(&root_dst->root_item_lock);
498
499 tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
500 chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
501
502 for (i = 0; i < chunk_count; i++) {
503 ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
504 dst, dst_loff);
505 if (ret)
506 goto out;
507
508 loff += BTRFS_MAX_DEDUPE_LEN;
509 dst_loff += BTRFS_MAX_DEDUPE_LEN;
510 }
511
512 if (tail_len > 0)
513 ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
514out:
515 spin_lock(&root_dst->root_item_lock);
516 root_dst->dedupe_in_progress--;
517 spin_unlock(&root_dst->root_item_lock);
518
519 return ret;
520}
521
522static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
523 u64 off, u64 olen, u64 destoff)
524{
525 struct inode *inode = file_inode(file);
526 struct inode *src = file_inode(file_src);
527 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
528 int ret;
529 u64 len = olen;
530 u64 bs = fs_info->sb->s_blocksize;
531
532 /*
Filipe Manana6a177382020-02-28 13:04:17 +0000533 * VFS's generic_remap_file_range_prep() protects us from cloning the
534 * eof block into the middle of a file, which would result in corruption
535 * if the file size is not blocksize aligned. So we don't need to check
536 * for that case here.
537 */
538 if (off + len == src->i_size)
539 len = ALIGN(src->i_size, bs) - off;
540
541 if (destoff > inode->i_size) {
542 const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
543
544 ret = btrfs_cont_expand(inode, inode->i_size, destoff);
545 if (ret)
546 return ret;
547 /*
548 * We may have truncated the last block if the inode's size is
549 * not sector size aligned, so we need to wait for writeback to
550 * complete before proceeding further, otherwise we can race
551 * with cloning and attempt to increment a reference to an
552 * extent that no longer exists (writeback completed right after
553 * we found the previous extent covering eof and before we
554 * attempted to increment its reference count).
555 */
556 ret = btrfs_wait_ordered_range(inode, wb_start,
557 destoff - wb_start);
558 if (ret)
559 return ret;
560 }
561
562 /*
563 * Lock destination range to serialize with concurrent readpages() and
564 * source range to serialize with relocation.
565 */
566 btrfs_double_extent_lock(src, off, inode, destoff, len);
567 ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
568 btrfs_double_extent_unlock(src, off, inode, destoff, len);
569 /*
570 * Truncate page cache pages so that future reads will see the cloned
571 * data immediately and not the previous data.
572 */
573 truncate_inode_pages_range(&inode->i_data,
574 round_down(destoff, PAGE_SIZE),
575 round_up(destoff + len, PAGE_SIZE) - 1);
576
577 return ret;
578}
579
580static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
581 struct file *file_out, loff_t pos_out,
582 loff_t *len, unsigned int remap_flags)
583{
584 struct inode *inode_in = file_inode(file_in);
585 struct inode *inode_out = file_inode(file_out);
586 u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
587 bool same_inode = inode_out == inode_in;
588 u64 wb_len;
589 int ret;
590
591 if (!(remap_flags & REMAP_FILE_DEDUP)) {
592 struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
593
594 if (btrfs_root_readonly(root_out))
595 return -EROFS;
596
597 if (file_in->f_path.mnt != file_out->f_path.mnt ||
598 inode_in->i_sb != inode_out->i_sb)
599 return -EXDEV;
600 }
601
602 /* Don't make the dst file partly checksummed */
603 if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
604 (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
605 return -EINVAL;
606 }
607
608 /*
609 * Now that the inodes are locked, we need to start writeback ourselves
610 * and can not rely on the writeback from the VFS's generic helper
611 * generic_remap_file_range_prep() because:
612 *
613 * 1) For compression we must call filemap_fdatawrite_range() range
614 * twice (btrfs_fdatawrite_range() does it for us), and the generic
615 * helper only calls it once;
616 *
617 * 2) filemap_fdatawrite_range(), called by the generic helper only
618 * waits for the writeback to complete, i.e. for IO to be done, and
619 * not for the ordered extents to complete. We need to wait for them
620 * to complete so that new file extent items are in the fs tree.
621 */
622 if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
623 wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
624 else
625 wb_len = ALIGN(*len, bs);
626
627 /*
628 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
629 * any in progress could create its ordered extents after we wait for
630 * existing ordered extents below).
631 */
632 inode_dio_wait(inode_in);
633 if (!same_inode)
634 inode_dio_wait(inode_out);
635
636 /*
637 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
638 *
639 * Btrfs' back references do not have a block level granularity, they
640 * work at the whole extent level.
641 * NOCOW buffered write without data space reserved may not be able
642 * to fall back to CoW due to lack of data space, thus could cause
643 * data loss.
644 *
645 * Here we take a shortcut by flushing the whole inode, so that all
646 * nocow write should reach disk as nocow before we increase the
647 * reference of the extent. We could do better by only flushing NOCOW
648 * data, but that needs extra accounting.
649 *
650 * Also we don't need to check ASYNC_EXTENT, as async extent will be
651 * CoWed anyway, not affecting nocow part.
652 */
653 ret = filemap_flush(inode_in->i_mapping);
654 if (ret < 0)
655 return ret;
656
657 ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
658 wb_len);
659 if (ret < 0)
660 return ret;
661 ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
662 wb_len);
663 if (ret < 0)
664 return ret;
665
666 return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
667 len, remap_flags);
668}
669
670loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
671 struct file *dst_file, loff_t destoff, loff_t len,
672 unsigned int remap_flags)
673{
674 struct inode *src_inode = file_inode(src_file);
675 struct inode *dst_inode = file_inode(dst_file);
676 bool same_inode = dst_inode == src_inode;
677 int ret;
678
679 if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
680 return -EINVAL;
681
682 if (same_inode)
683 inode_lock(src_inode);
684 else
685 lock_two_nondirectories(src_inode, dst_inode);
686
687 ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
688 &len, remap_flags);
689 if (ret < 0 || len == 0)
690 goto out_unlock;
691
692 if (remap_flags & REMAP_FILE_DEDUP)
693 ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
694 else
695 ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
696
697out_unlock:
698 if (same_inode)
699 inode_unlock(src_inode);
700 else
701 unlock_two_nondirectories(src_inode, dst_inode);
702
703 return ret < 0 ? ret : len;
704}