blob: b94a0cca4ecd5a9a6fe864715e76c32398fe6911 [file] [log] [blame]
David Sterbac1d7c512018-04-03 19:23:33 +02001// SPDX-License-Identifier: GPL-2.0
Chris Masone02119d2008-09-05 16:13:11 -04002/*
3 * Copyright (C) 2008 Oracle. All rights reserved.
Chris Masone02119d2008-09-05 16:13:11 -04004 */
5
6#include <linux/sched.h>
Tejun Heo5a0e3ad2010-03-24 17:04:11 +09007#include <linux/slab.h>
Miao Xiec6adc9c2013-05-28 10:05:39 +00008#include <linux/blkdev.h>
Josef Bacik5dc562c2012-08-17 13:14:17 -04009#include <linux/list_sort.h>
Jeff Laytonc7f88c42017-12-11 06:35:12 -050010#include <linux/iversion.h>
Nikolay Borisov9678c542018-01-08 11:45:05 +020011#include "ctree.h"
Miao Xie995946d2014-04-02 19:51:06 +080012#include "tree-log.h"
Chris Masone02119d2008-09-05 16:13:11 -040013#include "disk-io.h"
14#include "locking.h"
15#include "print-tree.h"
Mark Fashehf1863732012-08-08 11:32:27 -070016#include "backref.h"
Anand Jainebb87652016-03-10 17:26:59 +080017#include "compression.h"
Qu Wenruodf2c95f2016-08-15 10:36:52 +080018#include "qgroup.h"
Liu Bo900c9982018-01-25 11:02:56 -070019#include "inode-map.h"
Chris Masone02119d2008-09-05 16:13:11 -040020
/*
 * Values accepted by the inode_only argument of btrfs_log_inode():
 *
 * LOG_INODE_ALL    - log everything
 * LOG_INODE_EXISTS - log just enough to recreate the inode during log replay
 *
 * NOTE(review): LOG_OTHER_INODE and LOG_OTHER_INODE_ALL are not described
 * here; check their users before relying on any particular meaning.
 */
#define LOG_INODE_ALL		0
#define LOG_INODE_EXISTS	1
#define LOG_OTHER_INODE		2
#define LOG_OTHER_INODE_ALL	3
Chris Masone02119d2008-09-05 16:13:11 -040031
32/*
Chris Mason12fcfd22009-03-24 10:24:20 -040033 * directory trouble cases
34 *
35 * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
36 * log, we must force a full commit before doing an fsync of the directory
37 * where the unlink was done.
38 * ---> record transid of last unlink/rename per directory
39 *
40 * mkdir foo/some_dir
41 * normal commit
42 * rename foo/some_dir foo2/some_dir
43 * mkdir foo/some_dir
44 * fsync foo/some_dir/some_file
45 *
46 * The fsync above will unlink the original some_dir without recording
47 * it in its new location (foo2). After a crash, some_dir will be gone
48 * unless the fsync of some_file forces a full commit
49 *
50 * 2) we must log any new names for any file or dir that is in the fsync
51 * log. ---> check inode while renaming/linking.
52 *
53 * 2a) we must log any new names for any file or dir during rename
54 * when the directory they are being removed from was logged.
55 * ---> check inode and old parent dir during rename
56 *
 * 2a is actually the more important variant. Without the extra logging
 * a crash might unlink the old name without recreating the new one
59 *
60 * 3) after a crash, we must go through any directories with a link count
61 * of zero and redo the rm -rf
62 *
63 * mkdir f1/foo
64 * normal commit
65 * rm -rf f1/foo
66 * fsync(f1)
67 *
 * The directory f1/foo was fully removed from the FS, but fsync was never
 * called on f1/foo, only its parent dir. After a crash the rm -rf must
70 * be replayed. This must be able to recurse down the entire
71 * directory tree. The inode link count fixup code takes care of the
72 * ugly details.
73 */
74
/*
 * Stages of the log tree walk, in the order they are numbered:
 *
 * LOG_WALK_PIN_ONLY      - only pin down the blocks we find
 * LOG_WALK_REPLAY_INODES - make sure every inode found in the log is
 *                          created in the subvolume
 * LOG_WALK_REPLAY_ALL    - last stage: directories, links, extents and
 *                          all the other fun semantics
 *
 * NOTE(review): LOG_WALK_REPLAY_DIR_INDEX sits between the inode stage and
 * the final stage; its exact duties are not described here - confirm
 * against the replay code.
 */
#define LOG_WALK_PIN_ONLY		0
#define LOG_WALK_REPLAY_INODES		1
#define LOG_WALK_REPLAY_DIR_INDEX	2
#define LOG_WALK_REPLAY_ALL		3
Chris Masone02119d2008-09-05 16:13:11 -040088
Chris Mason12fcfd22009-03-24 10:24:20 -040089static int btrfs_log_inode(struct btrfs_trans_handle *trans,
Nikolay Borisova59108a2017-01-18 00:31:48 +020090 struct btrfs_root *root, struct btrfs_inode *inode,
Filipe Manana49dae1b2014-09-06 22:34:39 +010091 int inode_only,
92 const loff_t start,
Filipe Manana8407f552014-09-05 15:14:39 +010093 const loff_t end,
94 struct btrfs_log_ctx *ctx);
Yan Zhengec051c02009-01-05 15:43:42 -050095static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
96 struct btrfs_root *root,
97 struct btrfs_path *path, u64 objectid);
Chris Mason12fcfd22009-03-24 10:24:20 -040098static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root,
100 struct btrfs_root *log,
101 struct btrfs_path *path,
102 u64 dirid, int del_all);
Chris Masone02119d2008-09-05 16:13:11 -0400103
104/*
105 * tree logging is a special write ahead log used to make sure that
106 * fsyncs and O_SYNCs can happen without doing full tree commits.
107 *
108 * Full tree commits are expensive because they require commonly
109 * modified blocks to be recowed, creating many dirty pages in the
 * extent tree and a 4x-6x higher write load than ext3.
111 *
112 * Instead of doing a tree commit on every fsync, we use the
113 * key ranges and transaction ids to find items for a given file or directory
114 * that have changed in this transaction. Those items are copied into
115 * a special tree (one per subvolume root), that tree is written to disk
116 * and then the fsync is considered complete.
117 *
118 * After a crash, items are copied out of the log-tree back into the
119 * subvolume tree. Any file data extents found are recorded in the extent
120 * allocation tree, and the log-tree freed.
121 *
 * The log tree is read three times: once to pin down all the extents it is
 * using in ram, once to create all the inodes logged in the tree,
 * and once to do all the other items.
125 */
126
127/*
Chris Masone02119d2008-09-05 16:13:11 -0400128 * start a sub transaction and setup the log tree
129 * this increments the log tree writer count to make the people
130 * syncing the tree wait for us to finish
131 */
132static int start_log_trans(struct btrfs_trans_handle *trans,
Miao Xie8b050d32014-02-20 18:08:58 +0800133 struct btrfs_root *root,
134 struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -0400135{
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400136 struct btrfs_fs_info *fs_info = root->fs_info;
Zhaolei34eb2a52015-08-17 18:44:45 +0800137 int ret = 0;
Yan Zheng7237f182009-01-21 12:54:03 -0500138
139 mutex_lock(&root->log_mutex);
Zhaolei34eb2a52015-08-17 18:44:45 +0800140
Yan Zheng7237f182009-01-21 12:54:03 -0500141 if (root->log_root) {
David Sterba4884b8e2019-03-20 13:25:34 +0100142 if (btrfs_need_log_full_commit(trans)) {
Miao Xie50471a32014-02-20 18:08:57 +0800143 ret = -EAGAIN;
144 goto out;
145 }
Zhaolei34eb2a52015-08-17 18:44:45 +0800146
Josef Bacikff782e02009-10-08 15:30:04 -0400147 if (!root->log_start_pid) {
Miao Xie27cdeb72014-04-02 19:51:05 +0800148 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
Zhaolei34eb2a52015-08-17 18:44:45 +0800149 root->log_start_pid = current->pid;
Josef Bacikff782e02009-10-08 15:30:04 -0400150 } else if (root->log_start_pid != current->pid) {
Miao Xie27cdeb72014-04-02 19:51:05 +0800151 set_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
Josef Bacikff782e02009-10-08 15:30:04 -0400152 }
Zhaolei34eb2a52015-08-17 18:44:45 +0800153 } else {
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400154 mutex_lock(&fs_info->tree_log_mutex);
155 if (!fs_info->log_root_tree)
156 ret = btrfs_init_log_root_tree(trans, fs_info);
157 mutex_unlock(&fs_info->tree_log_mutex);
Zhaolei34eb2a52015-08-17 18:44:45 +0800158 if (ret)
159 goto out;
Josef Bacikff782e02009-10-08 15:30:04 -0400160
Chris Masone02119d2008-09-05 16:13:11 -0400161 ret = btrfs_add_log_tree(trans, root);
Yan, Zheng4a500fd2010-05-16 10:49:59 -0400162 if (ret)
Miao Xiee87ac132014-02-20 18:08:53 +0800163 goto out;
Zhaolei34eb2a52015-08-17 18:44:45 +0800164
165 clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
166 root->log_start_pid = current->pid;
Chris Masone02119d2008-09-05 16:13:11 -0400167 }
Zhaolei34eb2a52015-08-17 18:44:45 +0800168
Miao Xie2ecb7922012-09-06 04:04:27 -0600169 atomic_inc(&root->log_batch);
Yan Zheng7237f182009-01-21 12:54:03 -0500170 atomic_inc(&root->log_writers);
Miao Xie8b050d32014-02-20 18:08:58 +0800171 if (ctx) {
Zhaolei34eb2a52015-08-17 18:44:45 +0800172 int index = root->log_transid % 2;
Miao Xie8b050d32014-02-20 18:08:58 +0800173 list_add_tail(&ctx->list, &root->log_ctxs[index]);
Miao Xied1433de2014-02-20 18:08:59 +0800174 ctx->log_transid = root->log_transid;
Miao Xie8b050d32014-02-20 18:08:58 +0800175 }
Zhaolei34eb2a52015-08-17 18:44:45 +0800176
Miao Xiee87ac132014-02-20 18:08:53 +0800177out:
Yan Zheng7237f182009-01-21 12:54:03 -0500178 mutex_unlock(&root->log_mutex);
Miao Xiee87ac132014-02-20 18:08:53 +0800179 return ret;
Chris Masone02119d2008-09-05 16:13:11 -0400180}
181
182/*
183 * returns 0 if there was a log transaction running and we were able
184 * to join, or returns -ENOENT if there were not transactions
185 * in progress
186 */
187static int join_running_log_trans(struct btrfs_root *root)
188{
189 int ret = -ENOENT;
190
191 smp_mb();
192 if (!root->log_root)
193 return -ENOENT;
194
Yan Zheng7237f182009-01-21 12:54:03 -0500195 mutex_lock(&root->log_mutex);
Chris Masone02119d2008-09-05 16:13:11 -0400196 if (root->log_root) {
197 ret = 0;
Yan Zheng7237f182009-01-21 12:54:03 -0500198 atomic_inc(&root->log_writers);
Chris Masone02119d2008-09-05 16:13:11 -0400199 }
Yan Zheng7237f182009-01-21 12:54:03 -0500200 mutex_unlock(&root->log_mutex);
Chris Masone02119d2008-09-05 16:13:11 -0400201 return ret;
202}
203
204/*
Chris Mason12fcfd22009-03-24 10:24:20 -0400205 * This either makes the current running log transaction wait
206 * until you call btrfs_end_log_trans() or it makes any future
207 * log transactions wait until you call btrfs_end_log_trans()
208 */
zhong jiang45128b02018-08-17 00:37:15 +0800209void btrfs_pin_log_trans(struct btrfs_root *root)
Chris Mason12fcfd22009-03-24 10:24:20 -0400210{
Chris Mason12fcfd22009-03-24 10:24:20 -0400211 mutex_lock(&root->log_mutex);
212 atomic_inc(&root->log_writers);
213 mutex_unlock(&root->log_mutex);
Chris Mason12fcfd22009-03-24 10:24:20 -0400214}
215
216/*
Chris Masone02119d2008-09-05 16:13:11 -0400217 * indicate we're done making changes to the log tree
218 * and wake up anyone waiting to do a sync
219 */
Jeff Mahoney143bede2012-03-01 14:56:26 +0100220void btrfs_end_log_trans(struct btrfs_root *root)
Chris Masone02119d2008-09-05 16:13:11 -0400221{
Yan Zheng7237f182009-01-21 12:54:03 -0500222 if (atomic_dec_and_test(&root->log_writers)) {
David Sterba093258e2018-02-26 16:15:17 +0100223 /* atomic_dec_and_test implies a barrier */
224 cond_wake_up_nomb(&root->log_writer_wait);
Yan Zheng7237f182009-01-21 12:54:03 -0500225 }
Chris Masone02119d2008-09-05 16:13:11 -0400226}
227
David Sterba247462a2019-03-21 20:21:05 +0100228static int btrfs_write_tree_block(struct extent_buffer *buf)
229{
230 return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
231 buf->start + buf->len - 1);
232}
233
234static void btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
235{
236 filemap_fdatawait_range(buf->pages[0]->mapping,
237 buf->start, buf->start + buf->len - 1);
238}
Chris Masone02119d2008-09-05 16:13:11 -0400239
240/*
241 * the walk control struct is used to pass state down the chain when
242 * processing the log tree. The stage field tells us which part
243 * of the log tree processing we are currently doing. The others
244 * are state fields used for that specific part
245 */
246struct walk_control {
247 /* should we free the extent on disk when done? This is used
248 * at transaction commit time while freeing a log tree
249 */
250 int free;
251
252 /* should we write out the extent buffer? This is used
253 * while flushing the log tree to disk during a sync
254 */
255 int write;
256
257 /* should we wait for the extent buffer io to finish? Also used
258 * while flushing the log tree to disk for a sync
259 */
260 int wait;
261
262 /* pin only walk, we record which extents on disk belong to the
263 * log trees
264 */
265 int pin;
266
267 /* what stage of the replay code we're currently in */
268 int stage;
269
Filipe Mananaf2d72f42018-10-08 11:12:55 +0100270 /*
271 * Ignore any items from the inode currently being processed. Needs
272 * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
273 * the LOG_WALK_REPLAY_INODES stage.
274 */
275 bool ignore_cur_inode;
276
Chris Masone02119d2008-09-05 16:13:11 -0400277 /* the root we are currently replaying */
278 struct btrfs_root *replay_dest;
279
280 /* the trans handle for the current replay */
281 struct btrfs_trans_handle *trans;
282
283 /* the function that gets used to process blocks we find in the
284 * tree. Note the extent_buffer might not be up to date when it is
285 * passed in, and it must be checked or read if you need the data
286 * inside it
287 */
288 int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
Qu Wenruo581c1762018-03-29 09:08:11 +0800289 struct walk_control *wc, u64 gen, int level);
Chris Masone02119d2008-09-05 16:13:11 -0400290};
291
292/*
293 * process_func used to pin down extents, write them or wait on them
294 */
295static int process_one_buffer(struct btrfs_root *log,
296 struct extent_buffer *eb,
Qu Wenruo581c1762018-03-29 09:08:11 +0800297 struct walk_control *wc, u64 gen, int level)
Chris Masone02119d2008-09-05 16:13:11 -0400298{
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400299 struct btrfs_fs_info *fs_info = log->fs_info;
Josef Bacikb50c6e22013-04-25 15:55:30 -0400300 int ret = 0;
Chris Masone02119d2008-09-05 16:13:11 -0400301
Josef Bacik8c2a1a32013-06-06 13:19:32 -0400302 /*
303 * If this fs is mixed then we need to be able to process the leaves to
304 * pin down any logged extents, so we have to read the block.
305 */
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400306 if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
Qu Wenruo581c1762018-03-29 09:08:11 +0800307 ret = btrfs_read_buffer(eb, gen, level, NULL);
Josef Bacik8c2a1a32013-06-06 13:19:32 -0400308 if (ret)
309 return ret;
310 }
311
Josef Bacikb50c6e22013-04-25 15:55:30 -0400312 if (wc->pin)
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400313 ret = btrfs_pin_extent_for_log_replay(fs_info, eb->start,
314 eb->len);
Josef Bacikb50c6e22013-04-25 15:55:30 -0400315
316 if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
Josef Bacik8c2a1a32013-06-06 13:19:32 -0400317 if (wc->pin && btrfs_header_level(eb) == 0)
David Sterbabcdc4282019-03-20 12:14:33 +0100318 ret = btrfs_exclude_logged_extents(eb);
Chris Masone02119d2008-09-05 16:13:11 -0400319 if (wc->write)
320 btrfs_write_tree_block(eb);
321 if (wc->wait)
322 btrfs_wait_tree_block_writeback(eb);
323 }
Josef Bacikb50c6e22013-04-25 15:55:30 -0400324 return ret;
Chris Masone02119d2008-09-05 16:13:11 -0400325}
326
327/*
328 * Item overwrite used by replay and tree logging. eb, slot and key all refer
329 * to the src data we are copying out.
330 *
331 * root is the tree we are copying into, and path is a scratch
332 * path for use in this function (it should be released on entry and
333 * will be released on exit).
334 *
335 * If the key is already in the destination tree the existing item is
336 * overwritten. If the existing item isn't big enough, it is extended.
337 * If it is too large, it is truncated.
338 *
339 * If the key isn't in the destination yet, a new item is inserted.
340 */
341static noinline int overwrite_item(struct btrfs_trans_handle *trans,
342 struct btrfs_root *root,
343 struct btrfs_path *path,
344 struct extent_buffer *eb, int slot,
345 struct btrfs_key *key)
346{
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400347 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -0400348 int ret;
349 u32 item_size;
350 u64 saved_i_size = 0;
351 int save_old_i_size = 0;
352 unsigned long src_ptr;
353 unsigned long dst_ptr;
354 int overwrite_root = 0;
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000355 bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
Chris Masone02119d2008-09-05 16:13:11 -0400356
357 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
358 overwrite_root = 1;
359
360 item_size = btrfs_item_size_nr(eb, slot);
361 src_ptr = btrfs_item_ptr_offset(eb, slot);
362
363 /* look for the key in the destination tree */
364 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000365 if (ret < 0)
366 return ret;
367
Chris Masone02119d2008-09-05 16:13:11 -0400368 if (ret == 0) {
369 char *src_copy;
370 char *dst_copy;
371 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
372 path->slots[0]);
373 if (dst_size != item_size)
374 goto insert;
375
376 if (item_size == 0) {
David Sterbab3b4aa72011-04-21 01:20:15 +0200377 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400378 return 0;
379 }
380 dst_copy = kmalloc(item_size, GFP_NOFS);
381 src_copy = kmalloc(item_size, GFP_NOFS);
liubo2a29edc2011-01-26 06:22:08 +0000382 if (!dst_copy || !src_copy) {
David Sterbab3b4aa72011-04-21 01:20:15 +0200383 btrfs_release_path(path);
liubo2a29edc2011-01-26 06:22:08 +0000384 kfree(dst_copy);
385 kfree(src_copy);
386 return -ENOMEM;
387 }
Chris Masone02119d2008-09-05 16:13:11 -0400388
389 read_extent_buffer(eb, src_copy, src_ptr, item_size);
390
391 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
392 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
393 item_size);
394 ret = memcmp(dst_copy, src_copy, item_size);
395
396 kfree(dst_copy);
397 kfree(src_copy);
398 /*
399 * they have the same contents, just return, this saves
400 * us from cowing blocks in the destination tree and doing
401 * extra writes that may not have been done by a previous
402 * sync
403 */
404 if (ret == 0) {
David Sterbab3b4aa72011-04-21 01:20:15 +0200405 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400406 return 0;
407 }
408
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000409 /*
410 * We need to load the old nbytes into the inode so when we
411 * replay the extents we've logged we get the right nbytes.
412 */
413 if (inode_item) {
414 struct btrfs_inode_item *item;
415 u64 nbytes;
Josef Bacikd5554382013-09-11 14:17:00 -0400416 u32 mode;
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000417
418 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
419 struct btrfs_inode_item);
420 nbytes = btrfs_inode_nbytes(path->nodes[0], item);
421 item = btrfs_item_ptr(eb, slot,
422 struct btrfs_inode_item);
423 btrfs_set_inode_nbytes(eb, item, nbytes);
Josef Bacikd5554382013-09-11 14:17:00 -0400424
425 /*
426 * If this is a directory we need to reset the i_size to
427 * 0 so that we can set it up properly when replaying
428 * the rest of the items in this log.
429 */
430 mode = btrfs_inode_mode(eb, item);
431 if (S_ISDIR(mode))
432 btrfs_set_inode_size(eb, item, 0);
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000433 }
434 } else if (inode_item) {
435 struct btrfs_inode_item *item;
Josef Bacikd5554382013-09-11 14:17:00 -0400436 u32 mode;
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000437
438 /*
439 * New inode, set nbytes to 0 so that the nbytes comes out
440 * properly when we replay the extents.
441 */
442 item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
443 btrfs_set_inode_nbytes(eb, item, 0);
Josef Bacikd5554382013-09-11 14:17:00 -0400444
445 /*
446 * If this is a directory we need to reset the i_size to 0 so
447 * that we can set it up properly when replaying the rest of
448 * the items in this log.
449 */
450 mode = btrfs_inode_mode(eb, item);
451 if (S_ISDIR(mode))
452 btrfs_set_inode_size(eb, item, 0);
Chris Masone02119d2008-09-05 16:13:11 -0400453 }
454insert:
David Sterbab3b4aa72011-04-21 01:20:15 +0200455 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400456 /* try to insert the key into the destination tree */
Filipe Mananadf8d1162015-01-14 01:52:25 +0000457 path->skip_release_on_error = 1;
Chris Masone02119d2008-09-05 16:13:11 -0400458 ret = btrfs_insert_empty_item(trans, root, path,
459 key, item_size);
Filipe Mananadf8d1162015-01-14 01:52:25 +0000460 path->skip_release_on_error = 0;
Chris Masone02119d2008-09-05 16:13:11 -0400461
462 /* make sure any existing item is the correct size */
Filipe Mananadf8d1162015-01-14 01:52:25 +0000463 if (ret == -EEXIST || ret == -EOVERFLOW) {
Chris Masone02119d2008-09-05 16:13:11 -0400464 u32 found_size;
465 found_size = btrfs_item_size_nr(path->nodes[0],
466 path->slots[0]);
Jeff Mahoney143bede2012-03-01 14:56:26 +0100467 if (found_size > item_size)
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400468 btrfs_truncate_item(fs_info, path, item_size, 1);
Jeff Mahoney143bede2012-03-01 14:56:26 +0100469 else if (found_size < item_size)
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400470 btrfs_extend_item(fs_info, path,
Jeff Mahoney143bede2012-03-01 14:56:26 +0100471 item_size - found_size);
Chris Masone02119d2008-09-05 16:13:11 -0400472 } else if (ret) {
Yan, Zheng4a500fd2010-05-16 10:49:59 -0400473 return ret;
Chris Masone02119d2008-09-05 16:13:11 -0400474 }
475 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
476 path->slots[0]);
477
478 /* don't overwrite an existing inode if the generation number
479 * was logged as zero. This is done when the tree logging code
480 * is just logging an inode to make sure it exists after recovery.
481 *
482 * Also, don't overwrite i_size on directories during replay.
483 * log replay inserts and removes directory items based on the
484 * state of the tree found in the subvolume, and i_size is modified
485 * as it goes
486 */
487 if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
488 struct btrfs_inode_item *src_item;
489 struct btrfs_inode_item *dst_item;
490
491 src_item = (struct btrfs_inode_item *)src_ptr;
492 dst_item = (struct btrfs_inode_item *)dst_ptr;
493
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000494 if (btrfs_inode_generation(eb, src_item) == 0) {
495 struct extent_buffer *dst_eb = path->nodes[0];
Filipe Manana2f2ff0e2015-03-20 17:19:46 +0000496 const u64 ino_size = btrfs_inode_size(eb, src_item);
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000497
Filipe Manana2f2ff0e2015-03-20 17:19:46 +0000498 /*
499 * For regular files an ino_size == 0 is used only when
500 * logging that an inode exists, as part of a directory
501 * fsync, and the inode wasn't fsynced before. In this
502 * case don't set the size of the inode in the fs/subvol
503 * tree, otherwise we would be throwing valid data away.
504 */
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000505 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
Filipe Manana2f2ff0e2015-03-20 17:19:46 +0000506 S_ISREG(btrfs_inode_mode(dst_eb, dst_item)) &&
507 ino_size != 0) {
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000508 struct btrfs_map_token token;
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000509
510 btrfs_init_map_token(&token);
511 btrfs_set_token_inode_size(dst_eb, dst_item,
512 ino_size, &token);
513 }
Chris Masone02119d2008-09-05 16:13:11 -0400514 goto no_copy;
Filipe Manana1a4bcf42015-02-13 12:30:56 +0000515 }
Chris Masone02119d2008-09-05 16:13:11 -0400516
517 if (overwrite_root &&
518 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
519 S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
520 save_old_i_size = 1;
521 saved_i_size = btrfs_inode_size(path->nodes[0],
522 dst_item);
523 }
524 }
525
526 copy_extent_buffer(path->nodes[0], eb, dst_ptr,
527 src_ptr, item_size);
528
529 if (save_old_i_size) {
530 struct btrfs_inode_item *dst_item;
531 dst_item = (struct btrfs_inode_item *)dst_ptr;
532 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
533 }
534
535 /* make sure the generation is filled in */
536 if (key->type == BTRFS_INODE_ITEM_KEY) {
537 struct btrfs_inode_item *dst_item;
538 dst_item = (struct btrfs_inode_item *)dst_ptr;
539 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
540 btrfs_set_inode_generation(path->nodes[0], dst_item,
541 trans->transid);
542 }
543 }
544no_copy:
545 btrfs_mark_buffer_dirty(path->nodes[0]);
David Sterbab3b4aa72011-04-21 01:20:15 +0200546 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400547 return 0;
548}
549
550/*
551 * simple helper to read an inode off the disk from a given root
552 * This can only be called for subvolume roots and not for the log
553 */
554static noinline struct inode *read_one_inode(struct btrfs_root *root,
555 u64 objectid)
556{
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400557 struct btrfs_key key;
Chris Masone02119d2008-09-05 16:13:11 -0400558 struct inode *inode;
Chris Masone02119d2008-09-05 16:13:11 -0400559
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400560 key.objectid = objectid;
561 key.type = BTRFS_INODE_ITEM_KEY;
562 key.offset = 0;
Josef Bacik73f73412009-12-04 17:38:27 +0000563 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
Al Viro2e19f1f2018-07-29 23:04:45 +0100564 if (IS_ERR(inode))
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400565 inode = NULL;
Chris Masone02119d2008-09-05 16:13:11 -0400566 return inode;
567}
568
569/* replays a single extent in 'eb' at 'slot' with 'key' into the
570 * subvolume 'root'. path is released on entry and should be released
571 * on exit.
572 *
573 * extents in the log tree have not been allocated out of the extent
574 * tree yet. So, this completes the allocation, taking a reference
575 * as required if the extent already exists or creating a new extent
576 * if it isn't in the extent allocation tree yet.
577 *
578 * The extent is inserted into the file, dropping any existing extents
579 * from the file that overlap the new one.
580 */
581static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
582 struct btrfs_root *root,
583 struct btrfs_path *path,
584 struct extent_buffer *eb, int slot,
585 struct btrfs_key *key)
586{
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400587 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -0400588 int found_type;
Chris Masone02119d2008-09-05 16:13:11 -0400589 u64 extent_end;
Chris Masone02119d2008-09-05 16:13:11 -0400590 u64 start = key->offset;
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000591 u64 nbytes = 0;
Chris Masone02119d2008-09-05 16:13:11 -0400592 struct btrfs_file_extent_item *item;
593 struct inode *inode = NULL;
594 unsigned long size;
595 int ret = 0;
596
597 item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
598 found_type = btrfs_file_extent_type(eb, item);
599
Yan Zhengd899e052008-10-30 14:25:28 -0400600 if (found_type == BTRFS_FILE_EXTENT_REG ||
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000601 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
602 nbytes = btrfs_file_extent_num_bytes(eb, item);
603 extent_end = start + nbytes;
604
605 /*
606 * We don't add to the inodes nbytes if we are prealloc or a
607 * hole.
608 */
609 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
610 nbytes = 0;
611 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Qu Wenruoe41ca582018-06-06 15:41:49 +0800612 size = btrfs_file_extent_ram_bytes(eb, item);
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000613 nbytes = btrfs_file_extent_ram_bytes(eb, item);
Jeff Mahoneyda170662016-06-15 09:22:56 -0400614 extent_end = ALIGN(start + size,
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400615 fs_info->sectorsize);
Chris Masone02119d2008-09-05 16:13:11 -0400616 } else {
617 ret = 0;
618 goto out;
619 }
620
621 inode = read_one_inode(root, key->objectid);
622 if (!inode) {
623 ret = -EIO;
624 goto out;
625 }
626
627 /*
628 * first check to see if we already have this extent in the
629 * file. This must be done before the btrfs_drop_extents run
630 * so we don't try to drop this extent.
631 */
David Sterbaf85b7372017-01-20 14:54:07 +0100632 ret = btrfs_lookup_file_extent(trans, root, path,
633 btrfs_ino(BTRFS_I(inode)), start, 0);
Chris Masone02119d2008-09-05 16:13:11 -0400634
Yan Zhengd899e052008-10-30 14:25:28 -0400635 if (ret == 0 &&
636 (found_type == BTRFS_FILE_EXTENT_REG ||
637 found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
Chris Masone02119d2008-09-05 16:13:11 -0400638 struct btrfs_file_extent_item cmp1;
639 struct btrfs_file_extent_item cmp2;
640 struct btrfs_file_extent_item *existing;
641 struct extent_buffer *leaf;
642
643 leaf = path->nodes[0];
644 existing = btrfs_item_ptr(leaf, path->slots[0],
645 struct btrfs_file_extent_item);
646
647 read_extent_buffer(eb, &cmp1, (unsigned long)item,
648 sizeof(cmp1));
649 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
650 sizeof(cmp2));
651
652 /*
653 * we already have a pointer to this exact extent,
654 * we don't have to do anything
655 */
656 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
David Sterbab3b4aa72011-04-21 01:20:15 +0200657 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400658 goto out;
659 }
660 }
David Sterbab3b4aa72011-04-21 01:20:15 +0200661 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400662
663 /* drop any overlapping extents */
Josef Bacik26714852012-08-29 12:24:27 -0400664 ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
Josef Bacik36508602013-04-25 16:23:32 -0400665 if (ret)
666 goto out;
Chris Masone02119d2008-09-05 16:13:11 -0400667
Yan Zheng07d400a2009-01-06 11:42:00 -0500668 if (found_type == BTRFS_FILE_EXTENT_REG ||
669 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400670 u64 offset;
Yan Zheng07d400a2009-01-06 11:42:00 -0500671 unsigned long dest_offset;
672 struct btrfs_key ins;
Chris Masone02119d2008-09-05 16:13:11 -0400673
Filipe Manana3168021c2017-02-01 14:58:02 +0000674 if (btrfs_file_extent_disk_bytenr(eb, item) == 0 &&
675 btrfs_fs_incompat(fs_info, NO_HOLES))
676 goto update_inode;
677
Yan Zheng07d400a2009-01-06 11:42:00 -0500678 ret = btrfs_insert_empty_item(trans, root, path, key,
679 sizeof(*item));
Josef Bacik36508602013-04-25 16:23:32 -0400680 if (ret)
681 goto out;
Yan Zheng07d400a2009-01-06 11:42:00 -0500682 dest_offset = btrfs_item_ptr_offset(path->nodes[0],
683 path->slots[0]);
684 copy_extent_buffer(path->nodes[0], eb, dest_offset,
685 (unsigned long)item, sizeof(*item));
686
687 ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
688 ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
689 ins.type = BTRFS_EXTENT_ITEM_KEY;
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400690 offset = key->offset - btrfs_file_extent_offset(eb, item);
Yan Zheng07d400a2009-01-06 11:42:00 -0500691
Qu Wenruodf2c95f2016-08-15 10:36:52 +0800692 /*
693 * Manually record dirty extent, as here we did a shallow
694 * file extent item copy and skip normal backref update,
695 * but modifying extent tree all by ourselves.
696 * So need to manually record dirty extent for qgroup,
697 * as the owner of the file extent changed from log tree
698 * (doesn't affect qgroup) to fs/file tree(affects qgroup)
699 */
Lu Fengqia95f3aa2018-07-18 16:28:03 +0800700 ret = btrfs_qgroup_trace_extent(trans,
Qu Wenruodf2c95f2016-08-15 10:36:52 +0800701 btrfs_file_extent_disk_bytenr(eb, item),
702 btrfs_file_extent_disk_num_bytes(eb, item),
703 GFP_NOFS);
704 if (ret < 0)
705 goto out;
706
Yan Zheng07d400a2009-01-06 11:42:00 -0500707 if (ins.objectid > 0) {
Qu Wenruo82fa1132019-04-04 14:45:35 +0800708 struct btrfs_ref ref = { 0 };
Yan Zheng07d400a2009-01-06 11:42:00 -0500709 u64 csum_start;
710 u64 csum_end;
711 LIST_HEAD(ordered_sums);
Qu Wenruo82fa1132019-04-04 14:45:35 +0800712
Yan Zheng07d400a2009-01-06 11:42:00 -0500713 /*
714 * is this extent already allocated in the extent
715 * allocation tree? If so, just add a reference
716 */
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400717 ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
Yan Zheng07d400a2009-01-06 11:42:00 -0500718 ins.offset);
719 if (ret == 0) {
Qu Wenruo82fa1132019-04-04 14:45:35 +0800720 btrfs_init_generic_ref(&ref,
721 BTRFS_ADD_DELAYED_REF,
722 ins.objectid, ins.offset, 0);
723 btrfs_init_data_ref(&ref,
724 root->root_key.objectid,
Filipe Mananab06c4bf2015-10-23 07:52:54 +0100725 key->objectid, offset);
Qu Wenruo82fa1132019-04-04 14:45:35 +0800726 ret = btrfs_inc_extent_ref(trans, &ref);
Josef Bacikb50c6e22013-04-25 15:55:30 -0400727 if (ret)
728 goto out;
Yan Zheng07d400a2009-01-06 11:42:00 -0500729 } else {
730 /*
731 * insert the extent pointer in the extent
732 * allocation tree
733 */
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400734 ret = btrfs_alloc_logged_file_extent(trans,
Jeff Mahoney2ff7e612016-06-22 18:54:24 -0400735 root->root_key.objectid,
Yan Zheng5d4f98a2009-06-10 10:45:14 -0400736 key->objectid, offset, &ins);
Josef Bacikb50c6e22013-04-25 15:55:30 -0400737 if (ret)
738 goto out;
Yan Zheng07d400a2009-01-06 11:42:00 -0500739 }
David Sterbab3b4aa72011-04-21 01:20:15 +0200740 btrfs_release_path(path);
Yan Zheng07d400a2009-01-06 11:42:00 -0500741
742 if (btrfs_file_extent_compression(eb, item)) {
743 csum_start = ins.objectid;
744 csum_end = csum_start + ins.offset;
745 } else {
746 csum_start = ins.objectid +
747 btrfs_file_extent_offset(eb, item);
748 csum_end = csum_start +
749 btrfs_file_extent_num_bytes(eb, item);
750 }
751
752 ret = btrfs_lookup_csums_range(root->log_root,
753 csum_start, csum_end - 1,
Arne Jansena2de7332011-03-08 14:14:00 +0100754 &ordered_sums, 0);
Josef Bacik36508602013-04-25 16:23:32 -0400755 if (ret)
756 goto out;
Filipe Mananab84b8392015-08-19 11:09:40 +0100757 /*
758 * Now delete all existing csums in the csum root that
759 * cover our range. We do this because we can have an
760 * extent that is completely referenced by one file
761 * extent item and partially referenced by another
762 * file extent item (like after using the clone or
763 * extent_same ioctls). In this case if we end up doing
764 * the replay of the one that partially references the
765 * extent first, and we do not do the csum deletion
766 * below, we can get 2 csum items in the csum tree that
767 * overlap each other. For example, imagine our log has
768 * the two following file extent items:
769 *
770 * key (257 EXTENT_DATA 409600)
771 * extent data disk byte 12845056 nr 102400
772 * extent data offset 20480 nr 20480 ram 102400
773 *
774 * key (257 EXTENT_DATA 819200)
775 * extent data disk byte 12845056 nr 102400
776 * extent data offset 0 nr 102400 ram 102400
777 *
778 * Where the second one fully references the 100K extent
779 * that starts at disk byte 12845056, and the log tree
780 * has a single csum item that covers the entire range
781 * of the extent:
782 *
783 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
784 *
785 * After the first file extent item is replayed, the
786 * csum tree gets the following csum item:
787 *
788 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
789 *
790 * Which covers the 20K sub-range starting at offset 20K
791 * of our extent. Now when we replay the second file
792 * extent item, if we do not delete existing csum items
793 * that cover any of its blocks, we end up getting two
794 * csum items in our csum tree that overlap each other:
795 *
796 * key (EXTENT_CSUM EXTENT_CSUM 12845056) itemsize 100
797 * key (EXTENT_CSUM EXTENT_CSUM 12865536) itemsize 20
798 *
799 * Which is a problem, because after this anyone trying
800 * to look up the checksum of any block of our
801 * extent starting at an offset of 40K or higher, will
802 * end up looking at the second csum item only, which
803 * does not contain the checksum for any block starting
804 * at offset 40K or higher of our extent.
805 */
Yan Zheng07d400a2009-01-06 11:42:00 -0500806 while (!list_empty(&ordered_sums)) {
807 struct btrfs_ordered_sum *sums;
808 sums = list_entry(ordered_sums.next,
809 struct btrfs_ordered_sum,
810 list);
Josef Bacik36508602013-04-25 16:23:32 -0400811 if (!ret)
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400812 ret = btrfs_del_csums(trans, fs_info,
Jeff Mahoney5b4aace2016-06-21 10:40:19 -0400813 sums->bytenr,
814 sums->len);
Filipe Mananab84b8392015-08-19 11:09:40 +0100815 if (!ret)
Josef Bacik36508602013-04-25 16:23:32 -0400816 ret = btrfs_csum_file_blocks(trans,
Jeff Mahoney0b246af2016-06-22 18:54:23 -0400817 fs_info->csum_root, sums);
Yan Zheng07d400a2009-01-06 11:42:00 -0500818 list_del(&sums->list);
819 kfree(sums);
820 }
Josef Bacik36508602013-04-25 16:23:32 -0400821 if (ret)
822 goto out;
Yan Zheng07d400a2009-01-06 11:42:00 -0500823 } else {
David Sterbab3b4aa72011-04-21 01:20:15 +0200824 btrfs_release_path(path);
Yan Zheng07d400a2009-01-06 11:42:00 -0500825 }
826 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
827 /* inline extents are easy, we just overwrite them */
828 ret = overwrite_item(trans, root, path, eb, slot, key);
Josef Bacik36508602013-04-25 16:23:32 -0400829 if (ret)
830 goto out;
Yan Zheng07d400a2009-01-06 11:42:00 -0500831 }
832
Josef Bacik4bc4bee2013-04-05 20:50:09 +0000833 inode_add_bytes(inode, nbytes);
Filipe Manana3168021c2017-02-01 14:58:02 +0000834update_inode:
Tsutomu Itohb9959292012-06-25 21:25:22 -0600835 ret = btrfs_update_inode(trans, root, inode);
Chris Masone02119d2008-09-05 16:13:11 -0400836out:
837 if (inode)
838 iput(inode);
839 return ret;
840}
841
842/*
843 * when cleaning up conflicts between the directory names in the
844 * subvolume, directory names in the log and directory names in the
845 * inode back references, we may have to unlink inodes from directories.
846 *
847 * This is a helper function to do the unlink of a specific directory
848 * item
849 */
850static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
851 struct btrfs_root *root,
852 struct btrfs_path *path,
Nikolay Borisov207e7d92017-01-18 00:31:45 +0200853 struct btrfs_inode *dir,
Chris Masone02119d2008-09-05 16:13:11 -0400854 struct btrfs_dir_item *di)
855{
856 struct inode *inode;
857 char *name;
858 int name_len;
859 struct extent_buffer *leaf;
860 struct btrfs_key location;
861 int ret;
862
863 leaf = path->nodes[0];
864
865 btrfs_dir_item_key_to_cpu(leaf, di, &location);
866 name_len = btrfs_dir_name_len(leaf, di);
867 name = kmalloc(name_len, GFP_NOFS);
liubo2a29edc2011-01-26 06:22:08 +0000868 if (!name)
869 return -ENOMEM;
870
Chris Masone02119d2008-09-05 16:13:11 -0400871 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
David Sterbab3b4aa72011-04-21 01:20:15 +0200872 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400873
874 inode = read_one_inode(root, location.objectid);
Tsutomu Itohc00e9492011-04-28 09:10:23 +0000875 if (!inode) {
Josef Bacik36508602013-04-25 16:23:32 -0400876 ret = -EIO;
877 goto out;
Tsutomu Itohc00e9492011-04-28 09:10:23 +0000878 }
Chris Masone02119d2008-09-05 16:13:11 -0400879
Yan Zhengec051c02009-01-05 15:43:42 -0500880 ret = link_to_fixup_dir(trans, root, path, location.objectid);
Josef Bacik36508602013-04-25 16:23:32 -0400881 if (ret)
882 goto out;
Chris Mason12fcfd22009-03-24 10:24:20 -0400883
Nikolay Borisov207e7d92017-01-18 00:31:45 +0200884 ret = btrfs_unlink_inode(trans, root, dir, BTRFS_I(inode), name,
885 name_len);
Josef Bacik36508602013-04-25 16:23:32 -0400886 if (ret)
887 goto out;
Filipe David Borba Mananaada9af22013-08-05 09:25:47 +0100888 else
Nikolay Borisove5c304e62018-02-07 17:55:43 +0200889 ret = btrfs_run_delayed_items(trans);
Josef Bacik36508602013-04-25 16:23:32 -0400890out:
891 kfree(name);
892 iput(inode);
Chris Masone02119d2008-09-05 16:13:11 -0400893 return ret;
894}
895
896/*
897 * helper function to see if a given name and sequence number found
898 * in an inode back reference are already in a directory and correctly
899 * point to this inode
900 */
901static noinline int inode_in_dir(struct btrfs_root *root,
902 struct btrfs_path *path,
903 u64 dirid, u64 objectid, u64 index,
904 const char *name, int name_len)
905{
906 struct btrfs_dir_item *di;
907 struct btrfs_key location;
908 int match = 0;
909
910 di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
911 index, name, name_len, 0);
912 if (di && !IS_ERR(di)) {
913 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
914 if (location.objectid != objectid)
915 goto out;
916 } else
917 goto out;
David Sterbab3b4aa72011-04-21 01:20:15 +0200918 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400919
920 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
921 if (di && !IS_ERR(di)) {
922 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
923 if (location.objectid != objectid)
924 goto out;
925 } else
926 goto out;
927 match = 1;
928out:
David Sterbab3b4aa72011-04-21 01:20:15 +0200929 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -0400930 return match;
931}
932
933/*
934 * helper function to check a log tree for a named back reference in
935 * an inode. This is used to decide if a back reference that is
936 * found in the subvolume conflicts with what we find in the log.
937 *
938 * inode backreferences may have multiple refs in a single item,
939 * during replay we process one reference at a time, and we don't
940 * want to delete valid links to a file from the subvolume if that
941 * link is also in the log.
942 */
943static noinline int backref_in_log(struct btrfs_root *log,
944 struct btrfs_key *key,
Mark Fashehf1863732012-08-08 11:32:27 -0700945 u64 ref_objectid,
Filipe Mananadf8d1162015-01-14 01:52:25 +0000946 const char *name, int namelen)
Chris Masone02119d2008-09-05 16:13:11 -0400947{
948 struct btrfs_path *path;
949 struct btrfs_inode_ref *ref;
950 unsigned long ptr;
951 unsigned long ptr_end;
952 unsigned long name_ptr;
953 int found_name_len;
954 int item_size;
955 int ret;
956 int match = 0;
957
958 path = btrfs_alloc_path();
liubo2a29edc2011-01-26 06:22:08 +0000959 if (!path)
960 return -ENOMEM;
961
Chris Masone02119d2008-09-05 16:13:11 -0400962 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
963 if (ret != 0)
964 goto out;
965
Chris Masone02119d2008-09-05 16:13:11 -0400966 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
Mark Fashehf1863732012-08-08 11:32:27 -0700967
968 if (key->type == BTRFS_INODE_EXTREF_KEY) {
Filipe Manana1f250e92018-02-28 15:56:10 +0000969 if (btrfs_find_name_in_ext_backref(path->nodes[0],
970 path->slots[0],
971 ref_objectid,
Mark Fashehf1863732012-08-08 11:32:27 -0700972 name, namelen, NULL))
973 match = 1;
974
975 goto out;
976 }
977
978 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
Chris Masone02119d2008-09-05 16:13:11 -0400979 ptr_end = ptr + item_size;
980 while (ptr < ptr_end) {
981 ref = (struct btrfs_inode_ref *)ptr;
982 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
983 if (found_name_len == namelen) {
984 name_ptr = (unsigned long)(ref + 1);
985 ret = memcmp_extent_buffer(path->nodes[0], name,
986 name_ptr, namelen);
987 if (ret == 0) {
988 match = 1;
989 goto out;
990 }
991 }
992 ptr = (unsigned long)(ref + 1) + found_name_len;
993 }
994out:
995 btrfs_free_path(path);
996 return match;
997}
998
Jan Schmidt5a1d7842012-08-17 14:04:41 -0700999static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
1000 struct btrfs_root *root,
1001 struct btrfs_path *path,
1002 struct btrfs_root *log_root,
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001003 struct btrfs_inode *dir,
1004 struct btrfs_inode *inode,
Mark Fashehf1863732012-08-08 11:32:27 -07001005 u64 inode_objectid, u64 parent_objectid,
1006 u64 ref_index, char *name, int namelen,
1007 int *search_done)
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001008{
1009 int ret;
Mark Fashehf1863732012-08-08 11:32:27 -07001010 char *victim_name;
1011 int victim_name_len;
1012 struct extent_buffer *leaf;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001013 struct btrfs_dir_item *di;
Mark Fashehf1863732012-08-08 11:32:27 -07001014 struct btrfs_key search_key;
1015 struct btrfs_inode_extref *extref;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001016
Mark Fashehf1863732012-08-08 11:32:27 -07001017again:
1018 /* Search old style refs */
1019 search_key.objectid = inode_objectid;
1020 search_key.type = BTRFS_INODE_REF_KEY;
1021 search_key.offset = parent_objectid;
1022 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001023 if (ret == 0) {
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001024 struct btrfs_inode_ref *victim_ref;
1025 unsigned long ptr;
1026 unsigned long ptr_end;
Mark Fashehf1863732012-08-08 11:32:27 -07001027
1028 leaf = path->nodes[0];
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001029
1030 /* are we trying to overwrite a back ref for the root directory
1031 * if so, just jump out, we're done
1032 */
Mark Fashehf1863732012-08-08 11:32:27 -07001033 if (search_key.objectid == search_key.offset)
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001034 return 1;
1035
1036 /* check all the names in this back reference to see
1037 * if they are in the log. if so, we allow them to stay
1038 * otherwise they must be unlinked as a conflict
1039 */
1040 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1041 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
1042 while (ptr < ptr_end) {
1043 victim_ref = (struct btrfs_inode_ref *)ptr;
1044 victim_name_len = btrfs_inode_ref_name_len(leaf,
1045 victim_ref);
1046 victim_name = kmalloc(victim_name_len, GFP_NOFS);
Josef Bacik36508602013-04-25 16:23:32 -04001047 if (!victim_name)
1048 return -ENOMEM;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001049
1050 read_extent_buffer(leaf, victim_name,
1051 (unsigned long)(victim_ref + 1),
1052 victim_name_len);
1053
Mark Fashehf1863732012-08-08 11:32:27 -07001054 if (!backref_in_log(log_root, &search_key,
1055 parent_objectid,
1056 victim_name,
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001057 victim_name_len)) {
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001058 inc_nlink(&inode->vfs_inode);
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001059 btrfs_release_path(path);
1060
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001061 ret = btrfs_unlink_inode(trans, root, dir, inode,
Nikolay Borisov4ec59342017-01-18 00:31:44 +02001062 victim_name, victim_name_len);
Mark Fashehf1863732012-08-08 11:32:27 -07001063 kfree(victim_name);
Josef Bacik36508602013-04-25 16:23:32 -04001064 if (ret)
1065 return ret;
Nikolay Borisove5c304e62018-02-07 17:55:43 +02001066 ret = btrfs_run_delayed_items(trans);
Filipe David Borba Mananaada9af22013-08-05 09:25:47 +01001067 if (ret)
1068 return ret;
Mark Fashehf1863732012-08-08 11:32:27 -07001069 *search_done = 1;
1070 goto again;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001071 }
1072 kfree(victim_name);
Mark Fashehf1863732012-08-08 11:32:27 -07001073
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001074 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
1075 }
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001076
1077 /*
1078 * NOTE: we have searched root tree and checked the
Adam Buchbinderbb7ab3b2016-03-04 11:23:12 -08001079 * corresponding ref, it does not need to check again.
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001080 */
1081 *search_done = 1;
1082 }
1083 btrfs_release_path(path);
1084
Mark Fashehf1863732012-08-08 11:32:27 -07001085 /* Same search but for extended refs */
1086 extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
1087 inode_objectid, parent_objectid, 0,
1088 0);
1089 if (!IS_ERR_OR_NULL(extref)) {
1090 u32 item_size;
1091 u32 cur_offset = 0;
1092 unsigned long base;
1093 struct inode *victim_parent;
1094
1095 leaf = path->nodes[0];
1096
1097 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1098 base = btrfs_item_ptr_offset(leaf, path->slots[0]);
1099
1100 while (cur_offset < item_size) {
Quentin Casasnovasdd9ef132015-03-03 16:31:38 +01001101 extref = (struct btrfs_inode_extref *)(base + cur_offset);
Mark Fashehf1863732012-08-08 11:32:27 -07001102
1103 victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
1104
1105 if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
1106 goto next;
1107
1108 victim_name = kmalloc(victim_name_len, GFP_NOFS);
Josef Bacik36508602013-04-25 16:23:32 -04001109 if (!victim_name)
1110 return -ENOMEM;
Mark Fashehf1863732012-08-08 11:32:27 -07001111 read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
1112 victim_name_len);
1113
1114 search_key.objectid = inode_objectid;
1115 search_key.type = BTRFS_INODE_EXTREF_KEY;
1116 search_key.offset = btrfs_extref_hash(parent_objectid,
1117 victim_name,
1118 victim_name_len);
1119 ret = 0;
1120 if (!backref_in_log(log_root, &search_key,
1121 parent_objectid, victim_name,
1122 victim_name_len)) {
1123 ret = -ENOENT;
1124 victim_parent = read_one_inode(root,
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001125 parent_objectid);
Mark Fashehf1863732012-08-08 11:32:27 -07001126 if (victim_parent) {
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001127 inc_nlink(&inode->vfs_inode);
Mark Fashehf1863732012-08-08 11:32:27 -07001128 btrfs_release_path(path);
1129
1130 ret = btrfs_unlink_inode(trans, root,
Nikolay Borisov4ec59342017-01-18 00:31:44 +02001131 BTRFS_I(victim_parent),
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001132 inode,
Nikolay Borisov4ec59342017-01-18 00:31:44 +02001133 victim_name,
1134 victim_name_len);
Filipe David Borba Mananaada9af22013-08-05 09:25:47 +01001135 if (!ret)
1136 ret = btrfs_run_delayed_items(
Nikolay Borisove5c304e62018-02-07 17:55:43 +02001137 trans);
Mark Fashehf1863732012-08-08 11:32:27 -07001138 }
Mark Fashehf1863732012-08-08 11:32:27 -07001139 iput(victim_parent);
1140 kfree(victim_name);
Josef Bacik36508602013-04-25 16:23:32 -04001141 if (ret)
1142 return ret;
Mark Fashehf1863732012-08-08 11:32:27 -07001143 *search_done = 1;
1144 goto again;
1145 }
1146 kfree(victim_name);
Mark Fashehf1863732012-08-08 11:32:27 -07001147next:
1148 cur_offset += victim_name_len + sizeof(*extref);
1149 }
1150 *search_done = 1;
1151 }
1152 btrfs_release_path(path);
1153
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001154 /* look for a conflicting sequence number */
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001155 di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
Mark Fashehf1863732012-08-08 11:32:27 -07001156 ref_index, name, namelen, 0);
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001157 if (di && !IS_ERR(di)) {
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001158 ret = drop_one_dir_item(trans, root, path, dir, di);
Josef Bacik36508602013-04-25 16:23:32 -04001159 if (ret)
1160 return ret;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001161 }
1162 btrfs_release_path(path);
1163
Andrea Gelmini52042d82018-11-28 12:05:13 +01001164 /* look for a conflicting name */
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001165 di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001166 name, namelen, 0);
1167 if (di && !IS_ERR(di)) {
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001168 ret = drop_one_dir_item(trans, root, path, dir, di);
Josef Bacik36508602013-04-25 16:23:32 -04001169 if (ret)
1170 return ret;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001171 }
1172 btrfs_release_path(path);
1173
1174 return 0;
1175}
Chris Masone02119d2008-09-05 16:13:11 -04001176
Qu Wenruobae15d92017-11-08 08:54:26 +08001177static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1178 u32 *namelen, char **name, u64 *index,
1179 u64 *parent_objectid)
Mark Fashehf1863732012-08-08 11:32:27 -07001180{
1181 struct btrfs_inode_extref *extref;
1182
1183 extref = (struct btrfs_inode_extref *)ref_ptr;
1184
1185 *namelen = btrfs_inode_extref_name_len(eb, extref);
1186 *name = kmalloc(*namelen, GFP_NOFS);
1187 if (*name == NULL)
1188 return -ENOMEM;
1189
1190 read_extent_buffer(eb, *name, (unsigned long)&extref->name,
1191 *namelen);
1192
Filipe Manana1f250e92018-02-28 15:56:10 +00001193 if (index)
1194 *index = btrfs_inode_extref_index(eb, extref);
Mark Fashehf1863732012-08-08 11:32:27 -07001195 if (parent_objectid)
1196 *parent_objectid = btrfs_inode_extref_parent(eb, extref);
1197
1198 return 0;
1199}
1200
Qu Wenruobae15d92017-11-08 08:54:26 +08001201static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
1202 u32 *namelen, char **name, u64 *index)
Mark Fashehf1863732012-08-08 11:32:27 -07001203{
1204 struct btrfs_inode_ref *ref;
1205
1206 ref = (struct btrfs_inode_ref *)ref_ptr;
1207
1208 *namelen = btrfs_inode_ref_name_len(eb, ref);
1209 *name = kmalloc(*namelen, GFP_NOFS);
1210 if (*name == NULL)
1211 return -ENOMEM;
1212
1213 read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
1214
Filipe Manana1f250e92018-02-28 15:56:10 +00001215 if (index)
1216 *index = btrfs_inode_ref_index(eb, ref);
Mark Fashehf1863732012-08-08 11:32:27 -07001217
1218 return 0;
1219}
1220
Chris Masone02119d2008-09-05 16:13:11 -04001221/*
Filipe Manana1f250e92018-02-28 15:56:10 +00001222 * Take an inode reference item from the log tree and iterate all names from the
1223 * inode reference item in the subvolume tree with the same key (if it exists).
1224 * For any name that is not in the inode reference item from the log tree, do a
1225 * proper unlink of that name (that is, remove its entry from the inode
1226 * reference item and both dir index keys).
1227 */
1228static int unlink_old_inode_refs(struct btrfs_trans_handle *trans,
1229 struct btrfs_root *root,
1230 struct btrfs_path *path,
1231 struct btrfs_inode *inode,
1232 struct extent_buffer *log_eb,
1233 int log_slot,
1234 struct btrfs_key *key)
1235{
1236 int ret;
1237 unsigned long ref_ptr;
1238 unsigned long ref_end;
1239 struct extent_buffer *eb;
1240
1241again:
1242 btrfs_release_path(path);
1243 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
1244 if (ret > 0) {
1245 ret = 0;
1246 goto out;
1247 }
1248 if (ret < 0)
1249 goto out;
1250
1251 eb = path->nodes[0];
1252 ref_ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
1253 ref_end = ref_ptr + btrfs_item_size_nr(eb, path->slots[0]);
1254 while (ref_ptr < ref_end) {
1255 char *name = NULL;
1256 int namelen;
1257 u64 parent_id;
1258
1259 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1260 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1261 NULL, &parent_id);
1262 } else {
1263 parent_id = key->offset;
1264 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1265 NULL);
1266 }
1267 if (ret)
1268 goto out;
1269
1270 if (key->type == BTRFS_INODE_EXTREF_KEY)
1271 ret = btrfs_find_name_in_ext_backref(log_eb, log_slot,
1272 parent_id, name,
1273 namelen, NULL);
1274 else
1275 ret = btrfs_find_name_in_backref(log_eb, log_slot, name,
1276 namelen, NULL);
1277
1278 if (!ret) {
1279 struct inode *dir;
1280
1281 btrfs_release_path(path);
1282 dir = read_one_inode(root, parent_id);
1283 if (!dir) {
1284 ret = -ENOENT;
1285 kfree(name);
1286 goto out;
1287 }
1288 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
1289 inode, name, namelen);
1290 kfree(name);
1291 iput(dir);
1292 if (ret)
1293 goto out;
1294 goto again;
1295 }
1296
1297 kfree(name);
1298 ref_ptr += namelen;
1299 if (key->type == BTRFS_INODE_EXTREF_KEY)
1300 ref_ptr += sizeof(struct btrfs_inode_extref);
1301 else
1302 ref_ptr += sizeof(struct btrfs_inode_ref);
1303 }
1304 ret = 0;
1305 out:
1306 btrfs_release_path(path);
1307 return ret;
1308}
1309
Filipe Manana0d836392018-07-20 10:59:06 +01001310static int btrfs_inode_ref_exists(struct inode *inode, struct inode *dir,
1311 const u8 ref_type, const char *name,
1312 const int namelen)
1313{
1314 struct btrfs_key key;
1315 struct btrfs_path *path;
1316 const u64 parent_id = btrfs_ino(BTRFS_I(dir));
1317 int ret;
1318
1319 path = btrfs_alloc_path();
1320 if (!path)
1321 return -ENOMEM;
1322
1323 key.objectid = btrfs_ino(BTRFS_I(inode));
1324 key.type = ref_type;
1325 if (key.type == BTRFS_INODE_REF_KEY)
1326 key.offset = parent_id;
1327 else
1328 key.offset = btrfs_extref_hash(parent_id, name, namelen);
1329
1330 ret = btrfs_search_slot(NULL, BTRFS_I(inode)->root, &key, path, 0, 0);
1331 if (ret < 0)
1332 goto out;
1333 if (ret > 0) {
1334 ret = 0;
1335 goto out;
1336 }
1337 if (key.type == BTRFS_INODE_EXTREF_KEY)
1338 ret = btrfs_find_name_in_ext_backref(path->nodes[0],
1339 path->slots[0], parent_id,
1340 name, namelen, NULL);
1341 else
1342 ret = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
1343 name, namelen, NULL);
1344
1345out:
1346 btrfs_free_path(path);
1347 return ret;
1348}
1349
Filipe Manana6b5fc432019-02-13 12:14:03 +00001350static int add_link(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1351 struct inode *dir, struct inode *inode, const char *name,
1352 int namelen, u64 ref_index)
1353{
1354 struct btrfs_dir_item *dir_item;
1355 struct btrfs_key key;
1356 struct btrfs_path *path;
1357 struct inode *other_inode = NULL;
1358 int ret;
1359
1360 path = btrfs_alloc_path();
1361 if (!path)
1362 return -ENOMEM;
1363
1364 dir_item = btrfs_lookup_dir_item(NULL, root, path,
1365 btrfs_ino(BTRFS_I(dir)),
1366 name, namelen, 0);
1367 if (!dir_item) {
1368 btrfs_release_path(path);
1369 goto add_link;
1370 } else if (IS_ERR(dir_item)) {
1371 ret = PTR_ERR(dir_item);
1372 goto out;
1373 }
1374
1375 /*
1376 * Our inode's dentry collides with the dentry of another inode which is
1377 * in the log but not yet processed since it has a higher inode number.
1378 * So delete that other dentry.
1379 */
1380 btrfs_dir_item_key_to_cpu(path->nodes[0], dir_item, &key);
1381 btrfs_release_path(path);
1382 other_inode = read_one_inode(root, key.objectid);
1383 if (!other_inode) {
1384 ret = -ENOENT;
1385 goto out;
1386 }
1387 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir), BTRFS_I(other_inode),
1388 name, namelen);
1389 if (ret)
1390 goto out;
1391 /*
1392 * If we dropped the link count to 0, bump it so that later the iput()
1393 * on the inode will not free it. We will fixup the link count later.
1394 */
1395 if (other_inode->i_nlink == 0)
1396 inc_nlink(other_inode);
1397
1398 ret = btrfs_run_delayed_items(trans);
1399 if (ret)
1400 goto out;
1401add_link:
1402 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
1403 name, namelen, 0, ref_index);
1404out:
1405 iput(other_inode);
1406 btrfs_free_path(path);
1407
1408 return ret;
1409}
1410
Filipe Manana1f250e92018-02-28 15:56:10 +00001411/*
Chris Masone02119d2008-09-05 16:13:11 -04001412 * replay one inode back reference item found in the log tree.
1413 * eb, slot and key refer to the buffer and key found in the log tree.
1414 * root is the destination we are replaying into, and path is for temp
1415 * use by this function. (it should be released on return).
1416 */
1417static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
1418 struct btrfs_root *root,
1419 struct btrfs_root *log,
1420 struct btrfs_path *path,
1421 struct extent_buffer *eb, int slot,
1422 struct btrfs_key *key)
1423{
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001424 struct inode *dir = NULL;
1425 struct inode *inode = NULL;
Chris Masone02119d2008-09-05 16:13:11 -04001426 unsigned long ref_ptr;
1427 unsigned long ref_end;
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001428 char *name = NULL;
liubo34f3e4f2011-08-06 08:35:23 +00001429 int namelen;
1430 int ret;
liuboc622ae62011-03-26 08:01:12 -04001431 int search_done = 0;
Mark Fashehf1863732012-08-08 11:32:27 -07001432 int log_ref_ver = 0;
1433 u64 parent_objectid;
1434 u64 inode_objectid;
Chris Masonf46dbe32012-10-09 11:17:20 -04001435 u64 ref_index = 0;
Mark Fashehf1863732012-08-08 11:32:27 -07001436 int ref_struct_size;
1437
1438 ref_ptr = btrfs_item_ptr_offset(eb, slot);
1439 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
1440
1441 if (key->type == BTRFS_INODE_EXTREF_KEY) {
1442 struct btrfs_inode_extref *r;
1443
1444 ref_struct_size = sizeof(struct btrfs_inode_extref);
1445 log_ref_ver = 1;
1446 r = (struct btrfs_inode_extref *)ref_ptr;
1447 parent_objectid = btrfs_inode_extref_parent(eb, r);
1448 } else {
1449 ref_struct_size = sizeof(struct btrfs_inode_ref);
1450 parent_objectid = key->offset;
1451 }
1452 inode_objectid = key->objectid;
Chris Masone02119d2008-09-05 16:13:11 -04001453
Chris Masone02119d2008-09-05 16:13:11 -04001454 /*
1455 * it is possible that we didn't log all the parent directories
1456 * for a given inode. If we don't find the dir, just don't
1457 * copy the back ref in. The link count fixup code will take
1458 * care of the rest
1459 */
Mark Fashehf1863732012-08-08 11:32:27 -07001460 dir = read_one_inode(root, parent_objectid);
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001461 if (!dir) {
1462 ret = -ENOENT;
1463 goto out;
1464 }
Chris Masone02119d2008-09-05 16:13:11 -04001465
Mark Fashehf1863732012-08-08 11:32:27 -07001466 inode = read_one_inode(root, inode_objectid);
Tsutomu Itohc00e9492011-04-28 09:10:23 +00001467 if (!inode) {
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001468 ret = -EIO;
1469 goto out;
Tsutomu Itohc00e9492011-04-28 09:10:23 +00001470 }
Chris Masone02119d2008-09-05 16:13:11 -04001471
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001472 while (ref_ptr < ref_end) {
Mark Fashehf1863732012-08-08 11:32:27 -07001473 if (log_ref_ver) {
Qu Wenruobae15d92017-11-08 08:54:26 +08001474 ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
1475 &ref_index, &parent_objectid);
Mark Fashehf1863732012-08-08 11:32:27 -07001476 /*
1477 * parent object can change from one array
1478 * item to another.
1479 */
1480 if (!dir)
1481 dir = read_one_inode(root, parent_objectid);
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001482 if (!dir) {
1483 ret = -ENOENT;
1484 goto out;
1485 }
Mark Fashehf1863732012-08-08 11:32:27 -07001486 } else {
Qu Wenruobae15d92017-11-08 08:54:26 +08001487 ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
1488 &ref_index);
Mark Fashehf1863732012-08-08 11:32:27 -07001489 }
1490 if (ret)
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001491 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04001492
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001493 /* if we already have a perfect match, we're done */
David Sterbaf85b7372017-01-20 14:54:07 +01001494 if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
1495 btrfs_ino(BTRFS_I(inode)), ref_index,
1496 name, namelen)) {
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001497 /*
1498 * look for a conflicting back reference in the
1499 * metadata. if we find one we have to unlink that name
1500 * of the file before we add our new link. Later on, we
1501 * overwrite any existing back reference, and we don't
1502 * want to create dangling pointers in the directory.
1503 */
Chris Masone02119d2008-09-05 16:13:11 -04001504
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001505 if (!search_done) {
1506 ret = __add_inode_ref(trans, root, path, log,
Nikolay Borisov94c91a12017-01-18 00:31:46 +02001507 BTRFS_I(dir),
David Sterbad75eefd2017-02-10 20:20:19 +01001508 BTRFS_I(inode),
Mark Fashehf1863732012-08-08 11:32:27 -07001509 inode_objectid,
1510 parent_objectid,
1511 ref_index, name, namelen,
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001512 &search_done);
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001513 if (ret) {
1514 if (ret == 1)
1515 ret = 0;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001516 goto out;
Josef Bacik36508602013-04-25 16:23:32 -04001517 }
Chris Masone02119d2008-09-05 16:13:11 -04001518 }
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001519
Filipe Manana0d836392018-07-20 10:59:06 +01001520 /*
1521 * If a reference item already exists for this inode
1522 * with the same parent and name, but different index,
1523 * drop it and the corresponding directory index entries
1524 * from the parent before adding the new reference item
1525 * and dir index entries, otherwise we would fail with
1526 * -EEXIST returned from btrfs_add_link() below.
1527 */
1528 ret = btrfs_inode_ref_exists(inode, dir, key->type,
1529 name, namelen);
1530 if (ret > 0) {
1531 ret = btrfs_unlink_inode(trans, root,
1532 BTRFS_I(dir),
1533 BTRFS_I(inode),
1534 name, namelen);
1535 /*
1536 * If we dropped the link count to 0, bump it so
1537 * that later the iput() on the inode will not
1538 * free it. We will fixup the link count later.
1539 */
1540 if (!ret && inode->i_nlink == 0)
1541 inc_nlink(inode);
1542 }
1543 if (ret < 0)
1544 goto out;
1545
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001546 /* insert our name */
Filipe Manana6b5fc432019-02-13 12:14:03 +00001547 ret = add_link(trans, root, dir, inode, name, namelen,
1548 ref_index);
Josef Bacik36508602013-04-25 16:23:32 -04001549 if (ret)
1550 goto out;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001551
1552 btrfs_update_inode(trans, root, inode);
Chris Masone02119d2008-09-05 16:13:11 -04001553 }
liuboc622ae62011-03-26 08:01:12 -04001554
Mark Fashehf1863732012-08-08 11:32:27 -07001555 ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001556 kfree(name);
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001557 name = NULL;
Mark Fashehf1863732012-08-08 11:32:27 -07001558 if (log_ref_ver) {
1559 iput(dir);
1560 dir = NULL;
1561 }
Chris Masone02119d2008-09-05 16:13:11 -04001562 }
Chris Masone02119d2008-09-05 16:13:11 -04001563
Filipe Manana1f250e92018-02-28 15:56:10 +00001564 /*
1565 * Before we overwrite the inode reference item in the subvolume tree
1566 * with the item from the log tree, we must unlink all names from the
1567 * parent directory that are in the subvolume's tree inode reference
1568 * item, otherwise we end up with an inconsistent subvolume tree where
1569 * dir index entries exist for a name but there is no inode reference
1570 * item with the same name.
1571 */
1572 ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot,
1573 key);
1574 if (ret)
1575 goto out;
1576
Chris Masone02119d2008-09-05 16:13:11 -04001577 /* finally write the back reference in the inode */
1578 ret = overwrite_item(trans, root, path, eb, slot, key);
Jan Schmidt5a1d7842012-08-17 14:04:41 -07001579out:
David Sterbab3b4aa72011-04-21 01:20:15 +02001580 btrfs_release_path(path);
Geyslan G. Bem03b2f082013-10-11 15:35:45 -03001581 kfree(name);
Chris Masone02119d2008-09-05 16:13:11 -04001582 iput(dir);
1583 iput(inode);
Josef Bacik36508602013-04-25 16:23:32 -04001584 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04001585}
1586
Yan, Zhengc71bf092009-11-12 09:34:40 +00001587static int insert_orphan_item(struct btrfs_trans_handle *trans,
David Sterba9c4f61f2015-01-02 19:12:57 +01001588 struct btrfs_root *root, u64 ino)
Yan, Zhengc71bf092009-11-12 09:34:40 +00001589{
1590 int ret;
David Sterba381cf652015-01-02 18:45:16 +01001591
David Sterba9c4f61f2015-01-02 19:12:57 +01001592 ret = btrfs_insert_orphan_item(trans, root, ino);
1593 if (ret == -EEXIST)
1594 ret = 0;
David Sterba381cf652015-01-02 18:45:16 +01001595
Yan, Zhengc71bf092009-11-12 09:34:40 +00001596 return ret;
1597}
1598
Mark Fashehf1863732012-08-08 11:32:27 -07001599static int count_inode_extrefs(struct btrfs_root *root,
Nikolay Borisov36283652017-01-18 00:31:49 +02001600 struct btrfs_inode *inode, struct btrfs_path *path)
Chris Masone02119d2008-09-05 16:13:11 -04001601{
Mark Fashehf1863732012-08-08 11:32:27 -07001602 int ret = 0;
1603 int name_len;
1604 unsigned int nlink = 0;
1605 u32 item_size;
1606 u32 cur_offset = 0;
Nikolay Borisov36283652017-01-18 00:31:49 +02001607 u64 inode_objectid = btrfs_ino(inode);
Mark Fashehf1863732012-08-08 11:32:27 -07001608 u64 offset = 0;
1609 unsigned long ptr;
1610 struct btrfs_inode_extref *extref;
1611 struct extent_buffer *leaf;
1612
1613 while (1) {
1614 ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
1615 &extref, &offset);
1616 if (ret)
1617 break;
1618
1619 leaf = path->nodes[0];
1620 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1621 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
Filipe Manana2c2c4522015-01-13 16:40:04 +00001622 cur_offset = 0;
Mark Fashehf1863732012-08-08 11:32:27 -07001623
1624 while (cur_offset < item_size) {
1625 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
1626 name_len = btrfs_inode_extref_name_len(leaf, extref);
1627
1628 nlink++;
1629
1630 cur_offset += name_len + sizeof(*extref);
1631 }
1632
1633 offset++;
1634 btrfs_release_path(path);
1635 }
1636 btrfs_release_path(path);
1637
Filipe Manana2c2c4522015-01-13 16:40:04 +00001638 if (ret < 0 && ret != -ENOENT)
Mark Fashehf1863732012-08-08 11:32:27 -07001639 return ret;
1640 return nlink;
1641}
1642
1643static int count_inode_refs(struct btrfs_root *root,
Nikolay Borisovf329e312017-01-18 00:31:50 +02001644 struct btrfs_inode *inode, struct btrfs_path *path)
Mark Fashehf1863732012-08-08 11:32:27 -07001645{
Chris Masone02119d2008-09-05 16:13:11 -04001646 int ret;
1647 struct btrfs_key key;
Mark Fashehf1863732012-08-08 11:32:27 -07001648 unsigned int nlink = 0;
Chris Masone02119d2008-09-05 16:13:11 -04001649 unsigned long ptr;
1650 unsigned long ptr_end;
1651 int name_len;
Nikolay Borisovf329e312017-01-18 00:31:50 +02001652 u64 ino = btrfs_ino(inode);
Chris Masone02119d2008-09-05 16:13:11 -04001653
Li Zefan33345d012011-04-20 10:31:50 +08001654 key.objectid = ino;
Chris Masone02119d2008-09-05 16:13:11 -04001655 key.type = BTRFS_INODE_REF_KEY;
1656 key.offset = (u64)-1;
1657
Chris Masond3977122009-01-05 21:25:51 -05001658 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04001659 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1660 if (ret < 0)
1661 break;
1662 if (ret > 0) {
1663 if (path->slots[0] == 0)
1664 break;
1665 path->slots[0]--;
1666 }
Filipe David Borba Mananae93ae262013-10-14 22:49:11 +01001667process_slot:
Chris Masone02119d2008-09-05 16:13:11 -04001668 btrfs_item_key_to_cpu(path->nodes[0], &key,
1669 path->slots[0]);
Li Zefan33345d012011-04-20 10:31:50 +08001670 if (key.objectid != ino ||
Chris Masone02119d2008-09-05 16:13:11 -04001671 key.type != BTRFS_INODE_REF_KEY)
1672 break;
1673 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1674 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1675 path->slots[0]);
Chris Masond3977122009-01-05 21:25:51 -05001676 while (ptr < ptr_end) {
Chris Masone02119d2008-09-05 16:13:11 -04001677 struct btrfs_inode_ref *ref;
1678
1679 ref = (struct btrfs_inode_ref *)ptr;
1680 name_len = btrfs_inode_ref_name_len(path->nodes[0],
1681 ref);
1682 ptr = (unsigned long)(ref + 1) + name_len;
1683 nlink++;
1684 }
1685
1686 if (key.offset == 0)
1687 break;
Filipe David Borba Mananae93ae262013-10-14 22:49:11 +01001688 if (path->slots[0] > 0) {
1689 path->slots[0]--;
1690 goto process_slot;
1691 }
Chris Masone02119d2008-09-05 16:13:11 -04001692 key.offset--;
David Sterbab3b4aa72011-04-21 01:20:15 +02001693 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04001694 }
David Sterbab3b4aa72011-04-21 01:20:15 +02001695 btrfs_release_path(path);
Mark Fashehf1863732012-08-08 11:32:27 -07001696
1697 return nlink;
1698}
1699
1700/*
1701 * There are a few corners where the link count of the file can't
1702 * be properly maintained during replay. So, instead of adding
1703 * lots of complexity to the log code, we just scan the backrefs
1704 * for any file that has been through replay.
1705 *
1706 * The scan will update the link count on the inode to reflect the
1707 * number of back refs found. If it goes down to zero, the iput
1708 * will free the inode.
1709 */
1710static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1711 struct btrfs_root *root,
1712 struct inode *inode)
1713{
1714 struct btrfs_path *path;
1715 int ret;
1716 u64 nlink = 0;
Nikolay Borisov4a0cc7c2017-01-10 20:35:31 +02001717 u64 ino = btrfs_ino(BTRFS_I(inode));
Mark Fashehf1863732012-08-08 11:32:27 -07001718
1719 path = btrfs_alloc_path();
1720 if (!path)
1721 return -ENOMEM;
1722
Nikolay Borisovf329e312017-01-18 00:31:50 +02001723 ret = count_inode_refs(root, BTRFS_I(inode), path);
Mark Fashehf1863732012-08-08 11:32:27 -07001724 if (ret < 0)
1725 goto out;
1726
1727 nlink = ret;
1728
Nikolay Borisov36283652017-01-18 00:31:49 +02001729 ret = count_inode_extrefs(root, BTRFS_I(inode), path);
Mark Fashehf1863732012-08-08 11:32:27 -07001730 if (ret < 0)
1731 goto out;
1732
1733 nlink += ret;
1734
1735 ret = 0;
1736
Chris Masone02119d2008-09-05 16:13:11 -04001737 if (nlink != inode->i_nlink) {
Miklos Szeredibfe86842011-10-28 14:13:29 +02001738 set_nlink(inode, nlink);
Chris Masone02119d2008-09-05 16:13:11 -04001739 btrfs_update_inode(trans, root, inode);
1740 }
Chris Mason8d5bf1c2008-09-11 15:51:21 -04001741 BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Masone02119d2008-09-05 16:13:11 -04001742
Yan, Zhengc71bf092009-11-12 09:34:40 +00001743 if (inode->i_nlink == 0) {
1744 if (S_ISDIR(inode->i_mode)) {
1745 ret = replay_dir_deletes(trans, root, NULL, path,
Li Zefan33345d012011-04-20 10:31:50 +08001746 ino, 1);
Josef Bacik36508602013-04-25 16:23:32 -04001747 if (ret)
1748 goto out;
Yan, Zhengc71bf092009-11-12 09:34:40 +00001749 }
Li Zefan33345d012011-04-20 10:31:50 +08001750 ret = insert_orphan_item(trans, root, ino);
Chris Mason12fcfd22009-03-24 10:24:20 -04001751 }
Chris Mason12fcfd22009-03-24 10:24:20 -04001752
Mark Fashehf1863732012-08-08 11:32:27 -07001753out:
1754 btrfs_free_path(path);
1755 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04001756}
1757
1758static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1759 struct btrfs_root *root,
1760 struct btrfs_path *path)
1761{
1762 int ret;
1763 struct btrfs_key key;
1764 struct inode *inode;
1765
1766 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1767 key.type = BTRFS_ORPHAN_ITEM_KEY;
1768 key.offset = (u64)-1;
Chris Masond3977122009-01-05 21:25:51 -05001769 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04001770 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1771 if (ret < 0)
1772 break;
1773
1774 if (ret == 1) {
1775 if (path->slots[0] == 0)
1776 break;
1777 path->slots[0]--;
1778 }
1779
1780 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1781 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1782 key.type != BTRFS_ORPHAN_ITEM_KEY)
1783 break;
1784
1785 ret = btrfs_del_item(trans, root, path);
Tsutomu Itoh65a246c2011-05-19 04:37:44 +00001786 if (ret)
1787 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04001788
David Sterbab3b4aa72011-04-21 01:20:15 +02001789 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04001790 inode = read_one_inode(root, key.offset);
Tsutomu Itohc00e9492011-04-28 09:10:23 +00001791 if (!inode)
1792 return -EIO;
Chris Masone02119d2008-09-05 16:13:11 -04001793
1794 ret = fixup_inode_link_count(trans, root, inode);
Chris Masone02119d2008-09-05 16:13:11 -04001795 iput(inode);
Josef Bacik36508602013-04-25 16:23:32 -04001796 if (ret)
1797 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04001798
Chris Mason12fcfd22009-03-24 10:24:20 -04001799 /*
1800 * fixup on a directory may create new entries,
1801 * make sure we always look for the highset possible
1802 * offset
1803 */
1804 key.offset = (u64)-1;
Chris Masone02119d2008-09-05 16:13:11 -04001805 }
Tsutomu Itoh65a246c2011-05-19 04:37:44 +00001806 ret = 0;
1807out:
David Sterbab3b4aa72011-04-21 01:20:15 +02001808 btrfs_release_path(path);
Tsutomu Itoh65a246c2011-05-19 04:37:44 +00001809 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04001810}
1811
1812
1813/*
1814 * record a given inode in the fixup dir so we can check its link
1815 * count when replay is done. The link count is incremented here
1816 * so the inode won't go away until we check it
1817 */
1818static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1819 struct btrfs_root *root,
1820 struct btrfs_path *path,
1821 u64 objectid)
1822{
1823 struct btrfs_key key;
1824 int ret = 0;
1825 struct inode *inode;
1826
1827 inode = read_one_inode(root, objectid);
Tsutomu Itohc00e9492011-04-28 09:10:23 +00001828 if (!inode)
1829 return -EIO;
Chris Masone02119d2008-09-05 16:13:11 -04001830
1831 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
David Sterba962a2982014-06-04 18:41:45 +02001832 key.type = BTRFS_ORPHAN_ITEM_KEY;
Chris Masone02119d2008-09-05 16:13:11 -04001833 key.offset = objectid;
1834
1835 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1836
David Sterbab3b4aa72011-04-21 01:20:15 +02001837 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04001838 if (ret == 0) {
Josef Bacik9bf7a482013-03-01 13:35:47 -05001839 if (!inode->i_nlink)
1840 set_nlink(inode, 1);
1841 else
Zach Brown8b558c52013-10-16 12:10:34 -07001842 inc_nlink(inode);
Tsutomu Itohb9959292012-06-25 21:25:22 -06001843 ret = btrfs_update_inode(trans, root, inode);
Chris Masone02119d2008-09-05 16:13:11 -04001844 } else if (ret == -EEXIST) {
1845 ret = 0;
1846 } else {
Josef Bacik36508602013-04-25 16:23:32 -04001847 BUG(); /* Logic Error */
Chris Masone02119d2008-09-05 16:13:11 -04001848 }
1849 iput(inode);
1850
1851 return ret;
1852}
1853
1854/*
1855 * when replaying the log for a directory, we only insert names
1856 * for inodes that actually exist. This means an fsync on a directory
1857 * does not implicitly fsync all the new files in it
1858 */
1859static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1860 struct btrfs_root *root,
Chris Masone02119d2008-09-05 16:13:11 -04001861 u64 dirid, u64 index,
Zhaolei60d53eb2015-08-17 18:44:46 +08001862 char *name, int name_len,
Chris Masone02119d2008-09-05 16:13:11 -04001863 struct btrfs_key *location)
1864{
1865 struct inode *inode;
1866 struct inode *dir;
1867 int ret;
1868
1869 inode = read_one_inode(root, location->objectid);
1870 if (!inode)
1871 return -ENOENT;
1872
1873 dir = read_one_inode(root, dirid);
1874 if (!dir) {
1875 iput(inode);
1876 return -EIO;
1877 }
Josef Bacikd5554382013-09-11 14:17:00 -04001878
Nikolay Borisovdb0a6692017-02-20 13:51:08 +02001879 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
1880 name_len, 1, index);
Chris Masone02119d2008-09-05 16:13:11 -04001881
1882 /* FIXME, put inode into FIXUP list */
1883
1884 iput(inode);
1885 iput(dir);
1886 return ret;
1887}
1888
1889/*
Filipe Mananadf8d1162015-01-14 01:52:25 +00001890 * Return true if an inode reference exists in the log for the given name,
1891 * inode and parent inode.
1892 */
1893static bool name_in_log_ref(struct btrfs_root *log_root,
1894 const char *name, const int name_len,
1895 const u64 dirid, const u64 ino)
1896{
1897 struct btrfs_key search_key;
1898
1899 search_key.objectid = ino;
1900 search_key.type = BTRFS_INODE_REF_KEY;
1901 search_key.offset = dirid;
1902 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1903 return true;
1904
1905 search_key.type = BTRFS_INODE_EXTREF_KEY;
1906 search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1907 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1908 return true;
1909
1910 return false;
1911}
1912
1913/*
Chris Masone02119d2008-09-05 16:13:11 -04001914 * take a single entry in a log directory item and replay it into
1915 * the subvolume.
1916 *
1917 * if a conflicting item exists in the subdirectory already,
1918 * the inode it points to is unlinked and put into the link count
1919 * fix up tree.
1920 *
1921 * If a name from the log points to a file or directory that does
1922 * not exist in the FS, it is skipped. fsyncs on directories
1923 * do not force down inodes inside that directory, just changes to the
1924 * names or unlinks in a directory.
Filipe Mananabb53eda2015-07-15 23:26:43 +01001925 *
1926 * Returns < 0 on error, 0 if the name wasn't replayed (dentry points to a
1927 * non-existing inode) and 1 if the name was replayed.
Chris Masone02119d2008-09-05 16:13:11 -04001928 */
1929static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1930 struct btrfs_root *root,
1931 struct btrfs_path *path,
1932 struct extent_buffer *eb,
1933 struct btrfs_dir_item *di,
1934 struct btrfs_key *key)
1935{
1936 char *name;
1937 int name_len;
1938 struct btrfs_dir_item *dst_di;
1939 struct btrfs_key found_key;
1940 struct btrfs_key log_key;
1941 struct inode *dir;
Chris Masone02119d2008-09-05 16:13:11 -04001942 u8 log_type;
Chris Mason4bef0842008-09-08 11:18:08 -04001943 int exists;
Josef Bacik36508602013-04-25 16:23:32 -04001944 int ret = 0;
Josef Bacikd5554382013-09-11 14:17:00 -04001945 bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
Filipe Mananabb53eda2015-07-15 23:26:43 +01001946 bool name_added = false;
Chris Masone02119d2008-09-05 16:13:11 -04001947
1948 dir = read_one_inode(root, key->objectid);
Tsutomu Itohc00e9492011-04-28 09:10:23 +00001949 if (!dir)
1950 return -EIO;
Chris Masone02119d2008-09-05 16:13:11 -04001951
1952 name_len = btrfs_dir_name_len(eb, di);
1953 name = kmalloc(name_len, GFP_NOFS);
Filipe David Borba Manana2bac3252013-08-04 19:58:57 +01001954 if (!name) {
1955 ret = -ENOMEM;
1956 goto out;
1957 }
liubo2a29edc2011-01-26 06:22:08 +00001958
Chris Masone02119d2008-09-05 16:13:11 -04001959 log_type = btrfs_dir_type(eb, di);
1960 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1961 name_len);
1962
1963 btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason4bef0842008-09-08 11:18:08 -04001964 exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1965 if (exists == 0)
1966 exists = 1;
1967 else
1968 exists = 0;
David Sterbab3b4aa72011-04-21 01:20:15 +02001969 btrfs_release_path(path);
Chris Mason4bef0842008-09-08 11:18:08 -04001970
Chris Masone02119d2008-09-05 16:13:11 -04001971 if (key->type == BTRFS_DIR_ITEM_KEY) {
1972 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1973 name, name_len, 1);
Chris Masond3977122009-01-05 21:25:51 -05001974 } else if (key->type == BTRFS_DIR_INDEX_KEY) {
Chris Masone02119d2008-09-05 16:13:11 -04001975 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1976 key->objectid,
1977 key->offset, name,
1978 name_len, 1);
1979 } else {
Josef Bacik36508602013-04-25 16:23:32 -04001980 /* Corruption */
1981 ret = -EINVAL;
1982 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04001983 }
David Sterbac7040052011-04-19 18:00:01 +02001984 if (IS_ERR_OR_NULL(dst_di)) {
Chris Masone02119d2008-09-05 16:13:11 -04001985 /* we need a sequence number to insert, so we only
1986 * do inserts for the BTRFS_DIR_INDEX_KEY types
1987 */
1988 if (key->type != BTRFS_DIR_INDEX_KEY)
1989 goto out;
1990 goto insert;
1991 }
1992
1993 btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1994 /* the existing item matches the logged item */
1995 if (found_key.objectid == log_key.objectid &&
1996 found_key.type == log_key.type &&
1997 found_key.offset == log_key.offset &&
1998 btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
Filipe Mananaa2cc11d2014-09-08 22:53:18 +01001999 update_size = false;
Chris Masone02119d2008-09-05 16:13:11 -04002000 goto out;
2001 }
2002
2003 /*
2004 * don't drop the conflicting directory entry if the inode
2005 * for the new entry doesn't exist
2006 */
Chris Mason4bef0842008-09-08 11:18:08 -04002007 if (!exists)
Chris Masone02119d2008-09-05 16:13:11 -04002008 goto out;
2009
Nikolay Borisov207e7d92017-01-18 00:31:45 +02002010 ret = drop_one_dir_item(trans, root, path, BTRFS_I(dir), dst_di);
Josef Bacik36508602013-04-25 16:23:32 -04002011 if (ret)
2012 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002013
2014 if (key->type == BTRFS_DIR_INDEX_KEY)
2015 goto insert;
2016out:
David Sterbab3b4aa72011-04-21 01:20:15 +02002017 btrfs_release_path(path);
Josef Bacikd5554382013-09-11 14:17:00 -04002018 if (!ret && update_size) {
Nikolay Borisov6ef06d22017-02-20 13:50:34 +02002019 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name_len * 2);
Josef Bacikd5554382013-09-11 14:17:00 -04002020 ret = btrfs_update_inode(trans, root, dir);
2021 }
Chris Masone02119d2008-09-05 16:13:11 -04002022 kfree(name);
2023 iput(dir);
Filipe Mananabb53eda2015-07-15 23:26:43 +01002024 if (!ret && name_added)
2025 ret = 1;
Josef Bacik36508602013-04-25 16:23:32 -04002026 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04002027
2028insert:
Filipe Mananadf8d1162015-01-14 01:52:25 +00002029 if (name_in_log_ref(root->log_root, name, name_len,
2030 key->objectid, log_key.objectid)) {
2031 /* The dentry will be added later. */
2032 ret = 0;
2033 update_size = false;
2034 goto out;
2035 }
David Sterbab3b4aa72011-04-21 01:20:15 +02002036 btrfs_release_path(path);
Zhaolei60d53eb2015-08-17 18:44:46 +08002037 ret = insert_one_name(trans, root, key->objectid, key->offset,
2038 name, name_len, &log_key);
Filipe Mananadf8d1162015-01-14 01:52:25 +00002039 if (ret && ret != -ENOENT && ret != -EEXIST)
Josef Bacik36508602013-04-25 16:23:32 -04002040 goto out;
Filipe Mananabb53eda2015-07-15 23:26:43 +01002041 if (!ret)
2042 name_added = true;
Josef Bacikd5554382013-09-11 14:17:00 -04002043 update_size = false;
Josef Bacik36508602013-04-25 16:23:32 -04002044 ret = 0;
Chris Masone02119d2008-09-05 16:13:11 -04002045 goto out;
2046}
2047
2048/*
2049 * find all the names in a directory item and reconcile them into
2050 * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
2051 * one name in a directory item, but the same code gets used for
2052 * both directory index types
2053 */
2054static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
2055 struct btrfs_root *root,
2056 struct btrfs_path *path,
2057 struct extent_buffer *eb, int slot,
2058 struct btrfs_key *key)
2059{
Filipe Mananabb53eda2015-07-15 23:26:43 +01002060 int ret = 0;
Chris Masone02119d2008-09-05 16:13:11 -04002061 u32 item_size = btrfs_item_size_nr(eb, slot);
2062 struct btrfs_dir_item *di;
2063 int name_len;
2064 unsigned long ptr;
2065 unsigned long ptr_end;
Filipe Mananabb53eda2015-07-15 23:26:43 +01002066 struct btrfs_path *fixup_path = NULL;
Chris Masone02119d2008-09-05 16:13:11 -04002067
2068 ptr = btrfs_item_ptr_offset(eb, slot);
2069 ptr_end = ptr + item_size;
Chris Masond3977122009-01-05 21:25:51 -05002070 while (ptr < ptr_end) {
Chris Masone02119d2008-09-05 16:13:11 -04002071 di = (struct btrfs_dir_item *)ptr;
2072 name_len = btrfs_dir_name_len(eb, di);
2073 ret = replay_one_name(trans, root, path, eb, di, key);
Filipe Mananabb53eda2015-07-15 23:26:43 +01002074 if (ret < 0)
2075 break;
Chris Masone02119d2008-09-05 16:13:11 -04002076 ptr = (unsigned long)(di + 1);
2077 ptr += name_len;
Filipe Mananabb53eda2015-07-15 23:26:43 +01002078
2079 /*
2080 * If this entry refers to a non-directory (directories can not
2081 * have a link count > 1) and it was added in the transaction
2082 * that was not committed, make sure we fixup the link count of
2083 * the inode it the entry points to. Otherwise something like
2084 * the following would result in a directory pointing to an
2085 * inode with a wrong link that does not account for this dir
2086 * entry:
2087 *
2088 * mkdir testdir
2089 * touch testdir/foo
2090 * touch testdir/bar
2091 * sync
2092 *
2093 * ln testdir/bar testdir/bar_link
2094 * ln testdir/foo testdir/foo_link
2095 * xfs_io -c "fsync" testdir/bar
2096 *
2097 * <power failure>
2098 *
2099 * mount fs, log replay happens
2100 *
2101 * File foo would remain with a link count of 1 when it has two
2102 * entries pointing to it in the directory testdir. This would
2103 * make it impossible to ever delete the parent directory has
2104 * it would result in stale dentries that can never be deleted.
2105 */
2106 if (ret == 1 && btrfs_dir_type(eb, di) != BTRFS_FT_DIR) {
2107 struct btrfs_key di_key;
2108
2109 if (!fixup_path) {
2110 fixup_path = btrfs_alloc_path();
2111 if (!fixup_path) {
2112 ret = -ENOMEM;
2113 break;
2114 }
2115 }
2116
2117 btrfs_dir_item_key_to_cpu(eb, di, &di_key);
2118 ret = link_to_fixup_dir(trans, root, fixup_path,
2119 di_key.objectid);
2120 if (ret)
2121 break;
2122 }
2123 ret = 0;
Chris Masone02119d2008-09-05 16:13:11 -04002124 }
Filipe Mananabb53eda2015-07-15 23:26:43 +01002125 btrfs_free_path(fixup_path);
2126 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04002127}
2128
2129/*
2130 * directory replay has two parts. There are the standard directory
2131 * items in the log copied from the subvolume, and range items
2132 * created in the log while the subvolume was logged.
2133 *
2134 * The range items tell us which parts of the key space the log
2135 * is authoritative for. During replay, if a key in the subvolume
2136 * directory is in a logged range item, but not actually in the log
2137 * that means it was deleted from the directory before the fsync
2138 * and should be removed.
2139 */
2140static noinline int find_dir_range(struct btrfs_root *root,
2141 struct btrfs_path *path,
2142 u64 dirid, int key_type,
2143 u64 *start_ret, u64 *end_ret)
2144{
2145 struct btrfs_key key;
2146 u64 found_end;
2147 struct btrfs_dir_log_item *item;
2148 int ret;
2149 int nritems;
2150
2151 if (*start_ret == (u64)-1)
2152 return 1;
2153
2154 key.objectid = dirid;
2155 key.type = key_type;
2156 key.offset = *start_ret;
2157
2158 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2159 if (ret < 0)
2160 goto out;
2161 if (ret > 0) {
2162 if (path->slots[0] == 0)
2163 goto out;
2164 path->slots[0]--;
2165 }
2166 if (ret != 0)
2167 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2168
2169 if (key.type != key_type || key.objectid != dirid) {
2170 ret = 1;
2171 goto next;
2172 }
2173 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2174 struct btrfs_dir_log_item);
2175 found_end = btrfs_dir_log_end(path->nodes[0], item);
2176
2177 if (*start_ret >= key.offset && *start_ret <= found_end) {
2178 ret = 0;
2179 *start_ret = key.offset;
2180 *end_ret = found_end;
2181 goto out;
2182 }
2183 ret = 1;
2184next:
2185 /* check the next slot in the tree to see if it is a valid item */
2186 nritems = btrfs_header_nritems(path->nodes[0]);
Robbie Ko2a7bf532016-10-07 17:30:47 +08002187 path->slots[0]++;
Chris Masone02119d2008-09-05 16:13:11 -04002188 if (path->slots[0] >= nritems) {
2189 ret = btrfs_next_leaf(root, path);
2190 if (ret)
2191 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002192 }
2193
2194 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2195
2196 if (key.type != key_type || key.objectid != dirid) {
2197 ret = 1;
2198 goto out;
2199 }
2200 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2201 struct btrfs_dir_log_item);
2202 found_end = btrfs_dir_log_end(path->nodes[0], item);
2203 *start_ret = key.offset;
2204 *end_ret = found_end;
2205 ret = 0;
2206out:
David Sterbab3b4aa72011-04-21 01:20:15 +02002207 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04002208 return ret;
2209}
2210
2211/*
2212 * this looks for a given directory item in the log. If the directory
2213 * item is not in the log, the item is removed and the inode it points
2214 * to is unlinked
2215 */
2216static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
2217 struct btrfs_root *root,
2218 struct btrfs_root *log,
2219 struct btrfs_path *path,
2220 struct btrfs_path *log_path,
2221 struct inode *dir,
2222 struct btrfs_key *dir_key)
2223{
2224 int ret;
2225 struct extent_buffer *eb;
2226 int slot;
2227 u32 item_size;
2228 struct btrfs_dir_item *di;
2229 struct btrfs_dir_item *log_di;
2230 int name_len;
2231 unsigned long ptr;
2232 unsigned long ptr_end;
2233 char *name;
2234 struct inode *inode;
2235 struct btrfs_key location;
2236
2237again:
2238 eb = path->nodes[0];
2239 slot = path->slots[0];
2240 item_size = btrfs_item_size_nr(eb, slot);
2241 ptr = btrfs_item_ptr_offset(eb, slot);
2242 ptr_end = ptr + item_size;
Chris Masond3977122009-01-05 21:25:51 -05002243 while (ptr < ptr_end) {
Chris Masone02119d2008-09-05 16:13:11 -04002244 di = (struct btrfs_dir_item *)ptr;
2245 name_len = btrfs_dir_name_len(eb, di);
2246 name = kmalloc(name_len, GFP_NOFS);
2247 if (!name) {
2248 ret = -ENOMEM;
2249 goto out;
2250 }
2251 read_extent_buffer(eb, name, (unsigned long)(di + 1),
2252 name_len);
2253 log_di = NULL;
Chris Mason12fcfd22009-03-24 10:24:20 -04002254 if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
Chris Masone02119d2008-09-05 16:13:11 -04002255 log_di = btrfs_lookup_dir_item(trans, log, log_path,
2256 dir_key->objectid,
2257 name, name_len, 0);
Chris Mason12fcfd22009-03-24 10:24:20 -04002258 } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
Chris Masone02119d2008-09-05 16:13:11 -04002259 log_di = btrfs_lookup_dir_index_item(trans, log,
2260 log_path,
2261 dir_key->objectid,
2262 dir_key->offset,
2263 name, name_len, 0);
2264 }
Al Viro8d9e2202018-07-29 23:04:46 +01002265 if (!log_di || log_di == ERR_PTR(-ENOENT)) {
Chris Masone02119d2008-09-05 16:13:11 -04002266 btrfs_dir_item_key_to_cpu(eb, di, &location);
David Sterbab3b4aa72011-04-21 01:20:15 +02002267 btrfs_release_path(path);
2268 btrfs_release_path(log_path);
Chris Masone02119d2008-09-05 16:13:11 -04002269 inode = read_one_inode(root, location.objectid);
Tsutomu Itohc00e9492011-04-28 09:10:23 +00002270 if (!inode) {
2271 kfree(name);
2272 return -EIO;
2273 }
Chris Masone02119d2008-09-05 16:13:11 -04002274
2275 ret = link_to_fixup_dir(trans, root,
2276 path, location.objectid);
Josef Bacik36508602013-04-25 16:23:32 -04002277 if (ret) {
2278 kfree(name);
2279 iput(inode);
2280 goto out;
2281 }
2282
Zach Brown8b558c52013-10-16 12:10:34 -07002283 inc_nlink(inode);
Nikolay Borisov4ec59342017-01-18 00:31:44 +02002284 ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
2285 BTRFS_I(inode), name, name_len);
Josef Bacik36508602013-04-25 16:23:32 -04002286 if (!ret)
Nikolay Borisove5c304e62018-02-07 17:55:43 +02002287 ret = btrfs_run_delayed_items(trans);
Chris Masone02119d2008-09-05 16:13:11 -04002288 kfree(name);
2289 iput(inode);
Josef Bacik36508602013-04-25 16:23:32 -04002290 if (ret)
2291 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002292
2293 /* there might still be more names under this key
2294 * check and repeat if required
2295 */
2296 ret = btrfs_search_slot(NULL, root, dir_key, path,
2297 0, 0);
2298 if (ret == 0)
2299 goto again;
2300 ret = 0;
2301 goto out;
Filipe David Borba Manana269d0402013-10-28 17:39:21 +00002302 } else if (IS_ERR(log_di)) {
2303 kfree(name);
2304 return PTR_ERR(log_di);
Chris Masone02119d2008-09-05 16:13:11 -04002305 }
David Sterbab3b4aa72011-04-21 01:20:15 +02002306 btrfs_release_path(log_path);
Chris Masone02119d2008-09-05 16:13:11 -04002307 kfree(name);
2308
2309 ptr = (unsigned long)(di + 1);
2310 ptr += name_len;
2311 }
2312 ret = 0;
2313out:
David Sterbab3b4aa72011-04-21 01:20:15 +02002314 btrfs_release_path(path);
2315 btrfs_release_path(log_path);
Chris Masone02119d2008-09-05 16:13:11 -04002316 return ret;
2317}
2318
Filipe Manana4f764e52015-02-23 19:53:35 +00002319static int replay_xattr_deletes(struct btrfs_trans_handle *trans,
2320 struct btrfs_root *root,
2321 struct btrfs_root *log,
2322 struct btrfs_path *path,
2323 const u64 ino)
2324{
2325 struct btrfs_key search_key;
2326 struct btrfs_path *log_path;
2327 int i;
2328 int nritems;
2329 int ret;
2330
2331 log_path = btrfs_alloc_path();
2332 if (!log_path)
2333 return -ENOMEM;
2334
2335 search_key.objectid = ino;
2336 search_key.type = BTRFS_XATTR_ITEM_KEY;
2337 search_key.offset = 0;
2338again:
2339 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
2340 if (ret < 0)
2341 goto out;
2342process_leaf:
2343 nritems = btrfs_header_nritems(path->nodes[0]);
2344 for (i = path->slots[0]; i < nritems; i++) {
2345 struct btrfs_key key;
2346 struct btrfs_dir_item *di;
2347 struct btrfs_dir_item *log_di;
2348 u32 total_size;
2349 u32 cur;
2350
2351 btrfs_item_key_to_cpu(path->nodes[0], &key, i);
2352 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) {
2353 ret = 0;
2354 goto out;
2355 }
2356
2357 di = btrfs_item_ptr(path->nodes[0], i, struct btrfs_dir_item);
2358 total_size = btrfs_item_size_nr(path->nodes[0], i);
2359 cur = 0;
2360 while (cur < total_size) {
2361 u16 name_len = btrfs_dir_name_len(path->nodes[0], di);
2362 u16 data_len = btrfs_dir_data_len(path->nodes[0], di);
2363 u32 this_len = sizeof(*di) + name_len + data_len;
2364 char *name;
2365
2366 name = kmalloc(name_len, GFP_NOFS);
2367 if (!name) {
2368 ret = -ENOMEM;
2369 goto out;
2370 }
2371 read_extent_buffer(path->nodes[0], name,
2372 (unsigned long)(di + 1), name_len);
2373
2374 log_di = btrfs_lookup_xattr(NULL, log, log_path, ino,
2375 name, name_len, 0);
2376 btrfs_release_path(log_path);
2377 if (!log_di) {
2378 /* Doesn't exist in log tree, so delete it. */
2379 btrfs_release_path(path);
2380 di = btrfs_lookup_xattr(trans, root, path, ino,
2381 name, name_len, -1);
2382 kfree(name);
2383 if (IS_ERR(di)) {
2384 ret = PTR_ERR(di);
2385 goto out;
2386 }
2387 ASSERT(di);
2388 ret = btrfs_delete_one_dir_name(trans, root,
2389 path, di);
2390 if (ret)
2391 goto out;
2392 btrfs_release_path(path);
2393 search_key = key;
2394 goto again;
2395 }
2396 kfree(name);
2397 if (IS_ERR(log_di)) {
2398 ret = PTR_ERR(log_di);
2399 goto out;
2400 }
2401 cur += this_len;
2402 di = (struct btrfs_dir_item *)((char *)di + this_len);
2403 }
2404 }
2405 ret = btrfs_next_leaf(root, path);
2406 if (ret > 0)
2407 ret = 0;
2408 else if (ret == 0)
2409 goto process_leaf;
2410out:
2411 btrfs_free_path(log_path);
2412 btrfs_release_path(path);
2413 return ret;
2414}
2415
2416
Chris Masone02119d2008-09-05 16:13:11 -04002417/*
2418 * deletion replay happens before we copy any new directory items
2419 * out of the log or out of backreferences from inodes. It
2420 * scans the log to find ranges of keys that log is authoritative for,
2421 * and then scans the directory to find items in those ranges that are
2422 * not present in the log.
2423 *
2424 * Anything we don't find in the log is unlinked and removed from the
2425 * directory.
2426 */
2427static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
2428 struct btrfs_root *root,
2429 struct btrfs_root *log,
2430 struct btrfs_path *path,
Chris Mason12fcfd22009-03-24 10:24:20 -04002431 u64 dirid, int del_all)
Chris Masone02119d2008-09-05 16:13:11 -04002432{
2433 u64 range_start;
2434 u64 range_end;
2435 int key_type = BTRFS_DIR_LOG_ITEM_KEY;
2436 int ret = 0;
2437 struct btrfs_key dir_key;
2438 struct btrfs_key found_key;
2439 struct btrfs_path *log_path;
2440 struct inode *dir;
2441
2442 dir_key.objectid = dirid;
2443 dir_key.type = BTRFS_DIR_ITEM_KEY;
2444 log_path = btrfs_alloc_path();
2445 if (!log_path)
2446 return -ENOMEM;
2447
2448 dir = read_one_inode(root, dirid);
2449 /* it isn't an error if the inode isn't there, that can happen
2450 * because we replay the deletes before we copy in the inode item
2451 * from the log
2452 */
2453 if (!dir) {
2454 btrfs_free_path(log_path);
2455 return 0;
2456 }
2457again:
2458 range_start = 0;
2459 range_end = 0;
Chris Masond3977122009-01-05 21:25:51 -05002460 while (1) {
Chris Mason12fcfd22009-03-24 10:24:20 -04002461 if (del_all)
2462 range_end = (u64)-1;
2463 else {
2464 ret = find_dir_range(log, path, dirid, key_type,
2465 &range_start, &range_end);
2466 if (ret != 0)
2467 break;
2468 }
Chris Masone02119d2008-09-05 16:13:11 -04002469
2470 dir_key.offset = range_start;
Chris Masond3977122009-01-05 21:25:51 -05002471 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04002472 int nritems;
2473 ret = btrfs_search_slot(NULL, root, &dir_key, path,
2474 0, 0);
2475 if (ret < 0)
2476 goto out;
2477
2478 nritems = btrfs_header_nritems(path->nodes[0]);
2479 if (path->slots[0] >= nritems) {
2480 ret = btrfs_next_leaf(root, path);
Liu Bob98def72018-04-03 01:59:48 +08002481 if (ret == 1)
Chris Masone02119d2008-09-05 16:13:11 -04002482 break;
Liu Bob98def72018-04-03 01:59:48 +08002483 else if (ret < 0)
2484 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002485 }
2486 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2487 path->slots[0]);
2488 if (found_key.objectid != dirid ||
2489 found_key.type != dir_key.type)
2490 goto next_type;
2491
2492 if (found_key.offset > range_end)
2493 break;
2494
2495 ret = check_item_in_log(trans, root, log, path,
Chris Mason12fcfd22009-03-24 10:24:20 -04002496 log_path, dir,
2497 &found_key);
Josef Bacik36508602013-04-25 16:23:32 -04002498 if (ret)
2499 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002500 if (found_key.offset == (u64)-1)
2501 break;
2502 dir_key.offset = found_key.offset + 1;
2503 }
David Sterbab3b4aa72011-04-21 01:20:15 +02002504 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04002505 if (range_end == (u64)-1)
2506 break;
2507 range_start = range_end + 1;
2508 }
2509
2510next_type:
2511 ret = 0;
2512 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
2513 key_type = BTRFS_DIR_LOG_INDEX_KEY;
2514 dir_key.type = BTRFS_DIR_INDEX_KEY;
David Sterbab3b4aa72011-04-21 01:20:15 +02002515 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04002516 goto again;
2517 }
2518out:
David Sterbab3b4aa72011-04-21 01:20:15 +02002519 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04002520 btrfs_free_path(log_path);
2521 iput(dir);
2522 return ret;
2523}
2524
2525/*
2526 * the process_func used to replay items from the log tree. This
2527 * gets called in two different stages. The first stage just looks
2528 * for inodes and makes sure they are all copied into the subvolume.
2529 *
2530 * The second stage copies all the other item types from the log into
2531 * the subvolume. The two stage approach is slower, but gets rid of
2532 * lots of complexity around inodes referencing other inodes that exist
2533 * only in the log (references come from either directory items or inode
2534 * back refs).
2535 */
2536static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
Qu Wenruo581c1762018-03-29 09:08:11 +08002537 struct walk_control *wc, u64 gen, int level)
Chris Masone02119d2008-09-05 16:13:11 -04002538{
2539 int nritems;
2540 struct btrfs_path *path;
2541 struct btrfs_root *root = wc->replay_dest;
2542 struct btrfs_key key;
Chris Masone02119d2008-09-05 16:13:11 -04002543 int i;
2544 int ret;
2545
Qu Wenruo581c1762018-03-29 09:08:11 +08002546 ret = btrfs_read_buffer(eb, gen, level, NULL);
Tsutomu Itoh018642a2012-05-29 18:10:13 +09002547 if (ret)
2548 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04002549
2550 level = btrfs_header_level(eb);
2551
2552 if (level != 0)
2553 return 0;
2554
2555 path = btrfs_alloc_path();
Mark Fasheh1e5063d2011-07-12 10:46:06 -07002556 if (!path)
2557 return -ENOMEM;
Chris Masone02119d2008-09-05 16:13:11 -04002558
2559 nritems = btrfs_header_nritems(eb);
2560 for (i = 0; i < nritems; i++) {
2561 btrfs_item_key_to_cpu(eb, &key, i);
Chris Masone02119d2008-09-05 16:13:11 -04002562
2563 /* inode keys are done during the first stage */
2564 if (key.type == BTRFS_INODE_ITEM_KEY &&
2565 wc->stage == LOG_WALK_REPLAY_INODES) {
Chris Masone02119d2008-09-05 16:13:11 -04002566 struct btrfs_inode_item *inode_item;
2567 u32 mode;
2568
2569 inode_item = btrfs_item_ptr(eb, i,
2570 struct btrfs_inode_item);
Filipe Mananaf2d72f42018-10-08 11:12:55 +01002571 /*
2572 * If we have a tmpfile (O_TMPFILE) that got fsync'ed
2573 * and never got linked before the fsync, skip it, as
2574 * replaying it is pointless since it would be deleted
2575 * later. We skip logging tmpfiles, but it's always
2576 * possible we are replaying a log created with a kernel
2577 * that used to log tmpfiles.
2578 */
2579 if (btrfs_inode_nlink(eb, inode_item) == 0) {
2580 wc->ignore_cur_inode = true;
2581 continue;
2582 } else {
2583 wc->ignore_cur_inode = false;
2584 }
Filipe Manana4f764e52015-02-23 19:53:35 +00002585 ret = replay_xattr_deletes(wc->trans, root, log,
2586 path, key.objectid);
2587 if (ret)
2588 break;
Chris Masone02119d2008-09-05 16:13:11 -04002589 mode = btrfs_inode_mode(eb, inode_item);
2590 if (S_ISDIR(mode)) {
2591 ret = replay_dir_deletes(wc->trans,
Chris Mason12fcfd22009-03-24 10:24:20 -04002592 root, log, path, key.objectid, 0);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002593 if (ret)
2594 break;
Chris Masone02119d2008-09-05 16:13:11 -04002595 }
2596 ret = overwrite_item(wc->trans, root, path,
2597 eb, i, &key);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002598 if (ret)
2599 break;
Chris Masone02119d2008-09-05 16:13:11 -04002600
Filipe Manana471d5572018-04-05 22:55:12 +01002601 /*
2602 * Before replaying extents, truncate the inode to its
2603 * size. We need to do it now and not after log replay
2604 * because before an fsync we can have prealloc extents
2605 * added beyond the inode's i_size. If we did it after,
2606 * through orphan cleanup for example, we would drop
2607 * those prealloc extents just after replaying them.
Chris Masone02119d2008-09-05 16:13:11 -04002608 */
2609 if (S_ISREG(mode)) {
Filipe Manana471d5572018-04-05 22:55:12 +01002610 struct inode *inode;
2611 u64 from;
2612
2613 inode = read_one_inode(root, key.objectid);
2614 if (!inode) {
2615 ret = -EIO;
2616 break;
2617 }
2618 from = ALIGN(i_size_read(inode),
2619 root->fs_info->sectorsize);
2620 ret = btrfs_drop_extents(wc->trans, root, inode,
2621 from, (u64)-1, 1);
Filipe Manana471d5572018-04-05 22:55:12 +01002622 if (!ret) {
Filipe Mananaf2d72f42018-10-08 11:12:55 +01002623 /* Update the inode's nbytes. */
Filipe Manana471d5572018-04-05 22:55:12 +01002624 ret = btrfs_update_inode(wc->trans,
2625 root, inode);
2626 }
2627 iput(inode);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002628 if (ret)
2629 break;
Chris Masone02119d2008-09-05 16:13:11 -04002630 }
Yan, Zhengc71bf092009-11-12 09:34:40 +00002631
Chris Masone02119d2008-09-05 16:13:11 -04002632 ret = link_to_fixup_dir(wc->trans, root,
2633 path, key.objectid);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002634 if (ret)
2635 break;
Chris Masone02119d2008-09-05 16:13:11 -04002636 }
Josef Bacikdd8e7212013-09-11 11:57:23 -04002637
Filipe Mananaf2d72f42018-10-08 11:12:55 +01002638 if (wc->ignore_cur_inode)
2639 continue;
2640
Josef Bacikdd8e7212013-09-11 11:57:23 -04002641 if (key.type == BTRFS_DIR_INDEX_KEY &&
2642 wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
2643 ret = replay_one_dir_item(wc->trans, root, path,
2644 eb, i, &key);
2645 if (ret)
2646 break;
2647 }
2648
Chris Masone02119d2008-09-05 16:13:11 -04002649 if (wc->stage < LOG_WALK_REPLAY_ALL)
2650 continue;
2651
2652 /* these keys are simply copied */
2653 if (key.type == BTRFS_XATTR_ITEM_KEY) {
2654 ret = overwrite_item(wc->trans, root, path,
2655 eb, i, &key);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002656 if (ret)
2657 break;
Liu Bo2da1c662013-05-26 13:50:29 +00002658 } else if (key.type == BTRFS_INODE_REF_KEY ||
2659 key.type == BTRFS_INODE_EXTREF_KEY) {
Mark Fashehf1863732012-08-08 11:32:27 -07002660 ret = add_inode_ref(wc->trans, root, log, path,
2661 eb, i, &key);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002662 if (ret && ret != -ENOENT)
2663 break;
2664 ret = 0;
Chris Masone02119d2008-09-05 16:13:11 -04002665 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
2666 ret = replay_one_extent(wc->trans, root, path,
2667 eb, i, &key);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002668 if (ret)
2669 break;
Josef Bacikdd8e7212013-09-11 11:57:23 -04002670 } else if (key.type == BTRFS_DIR_ITEM_KEY) {
Chris Masone02119d2008-09-05 16:13:11 -04002671 ret = replay_one_dir_item(wc->trans, root, path,
2672 eb, i, &key);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002673 if (ret)
2674 break;
Chris Masone02119d2008-09-05 16:13:11 -04002675 }
2676 }
2677 btrfs_free_path(path);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002678 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04002679}
2680
Chris Masond3977122009-01-05 21:25:51 -05002681static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
Chris Masone02119d2008-09-05 16:13:11 -04002682 struct btrfs_root *root,
2683 struct btrfs_path *path, int *level,
2684 struct walk_control *wc)
2685{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002686 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04002687 u64 root_owner;
Chris Masone02119d2008-09-05 16:13:11 -04002688 u64 bytenr;
2689 u64 ptr_gen;
2690 struct extent_buffer *next;
2691 struct extent_buffer *cur;
2692 struct extent_buffer *parent;
2693 u32 blocksize;
2694 int ret = 0;
2695
2696 WARN_ON(*level < 0);
2697 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2698
Chris Masond3977122009-01-05 21:25:51 -05002699 while (*level > 0) {
Qu Wenruo581c1762018-03-29 09:08:11 +08002700 struct btrfs_key first_key;
2701
Chris Masone02119d2008-09-05 16:13:11 -04002702 WARN_ON(*level < 0);
2703 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2704 cur = path->nodes[*level];
2705
Dulshani Gunawardhanafae7f212013-10-31 10:30:08 +05302706 WARN_ON(btrfs_header_level(cur) != *level);
Chris Masone02119d2008-09-05 16:13:11 -04002707
2708 if (path->slots[*level] >=
2709 btrfs_header_nritems(cur))
2710 break;
2711
2712 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
2713 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
Qu Wenruo581c1762018-03-29 09:08:11 +08002714 btrfs_node_key_to_cpu(cur, &first_key, path->slots[*level]);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002715 blocksize = fs_info->nodesize;
Chris Masone02119d2008-09-05 16:13:11 -04002716
2717 parent = path->nodes[*level];
2718 root_owner = btrfs_header_owner(parent);
Chris Masone02119d2008-09-05 16:13:11 -04002719
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002720 next = btrfs_find_create_tree_block(fs_info, bytenr);
Liu Boc871b0f2016-06-06 12:01:23 -07002721 if (IS_ERR(next))
2722 return PTR_ERR(next);
Chris Masone02119d2008-09-05 16:13:11 -04002723
Chris Masone02119d2008-09-05 16:13:11 -04002724 if (*level == 1) {
Qu Wenruo581c1762018-03-29 09:08:11 +08002725 ret = wc->process_func(root, next, wc, ptr_gen,
2726 *level - 1);
Josef Bacikb50c6e22013-04-25 15:55:30 -04002727 if (ret) {
2728 free_extent_buffer(next);
Mark Fasheh1e5063d2011-07-12 10:46:06 -07002729 return ret;
Josef Bacikb50c6e22013-04-25 15:55:30 -04002730 }
Yan, Zheng4a500fd2010-05-16 10:49:59 -04002731
Chris Masone02119d2008-09-05 16:13:11 -04002732 path->slots[*level]++;
2733 if (wc->free) {
Qu Wenruo581c1762018-03-29 09:08:11 +08002734 ret = btrfs_read_buffer(next, ptr_gen,
2735 *level - 1, &first_key);
Tsutomu Itoh018642a2012-05-29 18:10:13 +09002736 if (ret) {
2737 free_extent_buffer(next);
2738 return ret;
2739 }
Chris Masone02119d2008-09-05 16:13:11 -04002740
Josef Bacik681ae502013-10-07 15:11:00 -04002741 if (trans) {
2742 btrfs_tree_lock(next);
David Sterba8bead252018-04-04 02:03:48 +02002743 btrfs_set_lock_blocking_write(next);
David Sterba6a884d7d2019-03-20 14:30:02 +01002744 btrfs_clean_tree_block(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002745 btrfs_wait_tree_block_writeback(next);
2746 btrfs_tree_unlock(next);
Liu Bo18464302018-01-25 11:02:51 -07002747 } else {
2748 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2749 clear_extent_buffer_dirty(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002750 }
Chris Masone02119d2008-09-05 16:13:11 -04002751
Chris Masone02119d2008-09-05 16:13:11 -04002752 WARN_ON(root_owner !=
2753 BTRFS_TREE_LOG_OBJECTID);
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002754 ret = btrfs_free_and_pin_reserved_extent(
2755 fs_info, bytenr,
2756 blocksize);
Josef Bacik36508602013-04-25 16:23:32 -04002757 if (ret) {
2758 free_extent_buffer(next);
2759 return ret;
2760 }
Chris Masone02119d2008-09-05 16:13:11 -04002761 }
2762 free_extent_buffer(next);
2763 continue;
2764 }
Qu Wenruo581c1762018-03-29 09:08:11 +08002765 ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
Tsutomu Itoh018642a2012-05-29 18:10:13 +09002766 if (ret) {
2767 free_extent_buffer(next);
2768 return ret;
2769 }
Chris Masone02119d2008-09-05 16:13:11 -04002770
2771 WARN_ON(*level <= 0);
2772 if (path->nodes[*level-1])
2773 free_extent_buffer(path->nodes[*level-1]);
2774 path->nodes[*level-1] = next;
2775 *level = btrfs_header_level(next);
2776 path->slots[*level] = 0;
2777 cond_resched();
2778 }
2779 WARN_ON(*level < 0);
2780 WARN_ON(*level >= BTRFS_MAX_LEVEL);
2781
Yan, Zheng4a500fd2010-05-16 10:49:59 -04002782 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
Chris Masone02119d2008-09-05 16:13:11 -04002783
2784 cond_resched();
2785 return 0;
2786}
2787
Chris Masond3977122009-01-05 21:25:51 -05002788static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
Chris Masone02119d2008-09-05 16:13:11 -04002789 struct btrfs_root *root,
2790 struct btrfs_path *path, int *level,
2791 struct walk_control *wc)
2792{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002793 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04002794 u64 root_owner;
Chris Masone02119d2008-09-05 16:13:11 -04002795 int i;
2796 int slot;
2797 int ret;
2798
Chris Masond3977122009-01-05 21:25:51 -05002799 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
Chris Masone02119d2008-09-05 16:13:11 -04002800 slot = path->slots[i];
Yan, Zheng4a500fd2010-05-16 10:49:59 -04002801 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
Chris Masone02119d2008-09-05 16:13:11 -04002802 path->slots[i]++;
2803 *level = i;
2804 WARN_ON(*level == 0);
2805 return 0;
2806 } else {
Zheng Yan31840ae2008-09-23 13:14:14 -04002807 struct extent_buffer *parent;
2808 if (path->nodes[*level] == root->node)
2809 parent = path->nodes[*level];
2810 else
2811 parent = path->nodes[*level + 1];
2812
2813 root_owner = btrfs_header_owner(parent);
Mark Fasheh1e5063d2011-07-12 10:46:06 -07002814 ret = wc->process_func(root, path->nodes[*level], wc,
Qu Wenruo581c1762018-03-29 09:08:11 +08002815 btrfs_header_generation(path->nodes[*level]),
2816 *level);
Mark Fasheh1e5063d2011-07-12 10:46:06 -07002817 if (ret)
2818 return ret;
2819
Chris Masone02119d2008-09-05 16:13:11 -04002820 if (wc->free) {
2821 struct extent_buffer *next;
2822
2823 next = path->nodes[*level];
2824
Josef Bacik681ae502013-10-07 15:11:00 -04002825 if (trans) {
2826 btrfs_tree_lock(next);
David Sterba8bead252018-04-04 02:03:48 +02002827 btrfs_set_lock_blocking_write(next);
David Sterba6a884d7d2019-03-20 14:30:02 +01002828 btrfs_clean_tree_block(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002829 btrfs_wait_tree_block_writeback(next);
2830 btrfs_tree_unlock(next);
Liu Bo18464302018-01-25 11:02:51 -07002831 } else {
2832 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2833 clear_extent_buffer_dirty(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002834 }
Chris Masone02119d2008-09-05 16:13:11 -04002835
Chris Masone02119d2008-09-05 16:13:11 -04002836 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002837 ret = btrfs_free_and_pin_reserved_extent(
2838 fs_info,
Chris Masone02119d2008-09-05 16:13:11 -04002839 path->nodes[*level]->start,
Chris Masond00aff02008-09-11 15:54:42 -04002840 path->nodes[*level]->len);
Josef Bacik36508602013-04-25 16:23:32 -04002841 if (ret)
2842 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04002843 }
2844 free_extent_buffer(path->nodes[*level]);
2845 path->nodes[*level] = NULL;
2846 *level = i + 1;
2847 }
2848 }
2849 return 1;
2850}
2851
2852/*
2853 * drop the reference count on the tree rooted at 'snap'. This traverses
2854 * the tree freeing any blocks that have a ref count of zero after being
2855 * decremented.
2856 */
2857static int walk_log_tree(struct btrfs_trans_handle *trans,
2858 struct btrfs_root *log, struct walk_control *wc)
2859{
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002860 struct btrfs_fs_info *fs_info = log->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04002861 int ret = 0;
2862 int wret;
2863 int level;
2864 struct btrfs_path *path;
Chris Masone02119d2008-09-05 16:13:11 -04002865 int orig_level;
2866
2867 path = btrfs_alloc_path();
Tsutomu Itohdb5b4932011-03-23 08:14:16 +00002868 if (!path)
2869 return -ENOMEM;
Chris Masone02119d2008-09-05 16:13:11 -04002870
2871 level = btrfs_header_level(log->node);
2872 orig_level = level;
2873 path->nodes[level] = log->node;
2874 extent_buffer_get(log->node);
2875 path->slots[level] = 0;
2876
Chris Masond3977122009-01-05 21:25:51 -05002877 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04002878 wret = walk_down_log_tree(trans, log, path, &level, wc);
2879 if (wret > 0)
2880 break;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002881 if (wret < 0) {
Chris Masone02119d2008-09-05 16:13:11 -04002882 ret = wret;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002883 goto out;
2884 }
Chris Masone02119d2008-09-05 16:13:11 -04002885
2886 wret = walk_up_log_tree(trans, log, path, &level, wc);
2887 if (wret > 0)
2888 break;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002889 if (wret < 0) {
Chris Masone02119d2008-09-05 16:13:11 -04002890 ret = wret;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002891 goto out;
2892 }
Chris Masone02119d2008-09-05 16:13:11 -04002893 }
2894
2895 /* was the root node processed? if not, catch it here */
2896 if (path->nodes[orig_level]) {
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002897 ret = wc->process_func(log, path->nodes[orig_level], wc,
Qu Wenruo581c1762018-03-29 09:08:11 +08002898 btrfs_header_generation(path->nodes[orig_level]),
2899 orig_level);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002900 if (ret)
2901 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002902 if (wc->free) {
2903 struct extent_buffer *next;
2904
2905 next = path->nodes[orig_level];
2906
Josef Bacik681ae502013-10-07 15:11:00 -04002907 if (trans) {
2908 btrfs_tree_lock(next);
David Sterba8bead252018-04-04 02:03:48 +02002909 btrfs_set_lock_blocking_write(next);
David Sterba6a884d7d2019-03-20 14:30:02 +01002910 btrfs_clean_tree_block(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002911 btrfs_wait_tree_block_writeback(next);
2912 btrfs_tree_unlock(next);
Liu Bo18464302018-01-25 11:02:51 -07002913 } else {
2914 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags))
2915 clear_extent_buffer_dirty(next);
Josef Bacik681ae502013-10-07 15:11:00 -04002916 }
Chris Masone02119d2008-09-05 16:13:11 -04002917
Chris Masone02119d2008-09-05 16:13:11 -04002918 WARN_ON(log->root_key.objectid !=
2919 BTRFS_TREE_LOG_OBJECTID);
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04002920 ret = btrfs_free_and_pin_reserved_extent(fs_info,
2921 next->start, next->len);
Josef Bacik36508602013-04-25 16:23:32 -04002922 if (ret)
2923 goto out;
Chris Masone02119d2008-09-05 16:13:11 -04002924 }
2925 }
2926
Jeff Mahoney79787ea2012-03-12 16:03:00 +01002927out:
Chris Masone02119d2008-09-05 16:13:11 -04002928 btrfs_free_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04002929 return ret;
2930}
2931
Yan Zheng7237f182009-01-21 12:54:03 -05002932/*
2933 * helper function to update the item for a given subvolumes log root
2934 * in the tree of log roots
2935 */
2936static int update_log_root(struct btrfs_trans_handle *trans,
2937 struct btrfs_root *log)
2938{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002939 struct btrfs_fs_info *fs_info = log->fs_info;
Yan Zheng7237f182009-01-21 12:54:03 -05002940 int ret;
2941
2942 if (log->log_transid == 1) {
2943 /* insert root item on the first sync */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002944 ret = btrfs_insert_root(trans, fs_info->log_root_tree,
Yan Zheng7237f182009-01-21 12:54:03 -05002945 &log->root_key, &log->root_item);
2946 } else {
Jeff Mahoney0b246af2016-06-22 18:54:23 -04002947 ret = btrfs_update_root(trans, fs_info->log_root_tree,
Yan Zheng7237f182009-01-21 12:54:03 -05002948 &log->root_key, &log->root_item);
2949 }
2950 return ret;
2951}
2952
Zhaolei60d53eb2015-08-17 18:44:46 +08002953static void wait_log_commit(struct btrfs_root *root, int transid)
Chris Masone02119d2008-09-05 16:13:11 -04002954{
2955 DEFINE_WAIT(wait);
Yan Zheng7237f182009-01-21 12:54:03 -05002956 int index = transid % 2;
Chris Masone02119d2008-09-05 16:13:11 -04002957
Yan Zheng7237f182009-01-21 12:54:03 -05002958 /*
2959 * we only allow two pending log transactions at a time,
2960 * so we know that if ours is more than 2 older than the
2961 * current transaction, we're done
2962 */
Liu Bo49e83f52017-09-01 16:14:30 -06002963 for (;;) {
Yan Zheng7237f182009-01-21 12:54:03 -05002964 prepare_to_wait(&root->log_commit_wait[index],
2965 &wait, TASK_UNINTERRUPTIBLE);
Liu Bo49e83f52017-09-01 16:14:30 -06002966
2967 if (!(root->log_transid_committed < transid &&
2968 atomic_read(&root->log_commit[index])))
2969 break;
2970
Yan Zheng7237f182009-01-21 12:54:03 -05002971 mutex_unlock(&root->log_mutex);
Liu Bo49e83f52017-09-01 16:14:30 -06002972 schedule();
Yan Zheng7237f182009-01-21 12:54:03 -05002973 mutex_lock(&root->log_mutex);
Liu Bo49e83f52017-09-01 16:14:30 -06002974 }
2975 finish_wait(&root->log_commit_wait[index], &wait);
Yan Zheng7237f182009-01-21 12:54:03 -05002976}
2977
Zhaolei60d53eb2015-08-17 18:44:46 +08002978static void wait_for_writer(struct btrfs_root *root)
Yan Zheng7237f182009-01-21 12:54:03 -05002979{
2980 DEFINE_WAIT(wait);
Miao Xie8b050d32014-02-20 18:08:58 +08002981
Liu Bo49e83f52017-09-01 16:14:30 -06002982 for (;;) {
2983 prepare_to_wait(&root->log_writer_wait, &wait,
2984 TASK_UNINTERRUPTIBLE);
2985 if (!atomic_read(&root->log_writers))
2986 break;
2987
Yan Zheng7237f182009-01-21 12:54:03 -05002988 mutex_unlock(&root->log_mutex);
Liu Bo49e83f52017-09-01 16:14:30 -06002989 schedule();
Filipe Manana575849e2015-02-11 11:12:39 +00002990 mutex_lock(&root->log_mutex);
Yan Zheng7237f182009-01-21 12:54:03 -05002991 }
Liu Bo49e83f52017-09-01 16:14:30 -06002992 finish_wait(&root->log_writer_wait, &wait);
Chris Masone02119d2008-09-05 16:13:11 -04002993}
2994
Miao Xie8b050d32014-02-20 18:08:58 +08002995static inline void btrfs_remove_log_ctx(struct btrfs_root *root,
2996 struct btrfs_log_ctx *ctx)
2997{
2998 if (!ctx)
2999 return;
3000
3001 mutex_lock(&root->log_mutex);
3002 list_del_init(&ctx->list);
3003 mutex_unlock(&root->log_mutex);
3004}
3005
3006/*
3007 * Invoked in log mutex context, or be sure there is no other task which
3008 * can access the list.
3009 */
3010static inline void btrfs_remove_all_log_ctxs(struct btrfs_root *root,
3011 int index, int error)
3012{
3013 struct btrfs_log_ctx *ctx;
Chris Mason570dd452016-10-27 10:42:20 -07003014 struct btrfs_log_ctx *safe;
Miao Xie8b050d32014-02-20 18:08:58 +08003015
Chris Mason570dd452016-10-27 10:42:20 -07003016 list_for_each_entry_safe(ctx, safe, &root->log_ctxs[index], list) {
3017 list_del_init(&ctx->list);
Miao Xie8b050d32014-02-20 18:08:58 +08003018 ctx->log_ret = error;
Chris Mason570dd452016-10-27 10:42:20 -07003019 }
Miao Xie8b050d32014-02-20 18:08:58 +08003020
3021 INIT_LIST_HEAD(&root->log_ctxs[index]);
3022}
3023
Chris Masone02119d2008-09-05 16:13:11 -04003024/*
3025 * btrfs_sync_log sends a given tree log down to the disk and
3026 * updates the super blocks to record it. When this call is done,
Chris Mason12fcfd22009-03-24 10:24:20 -04003027 * you know that any inodes previously logged are safely on disk only
3028 * if it returns 0.
3029 *
3030 * Any other return value means you need to call btrfs_commit_transaction.
3031 * Some of the edge cases for fsyncing directories that have had unlinks
3032 * or renames done in the past mean that sometimes the only safe
3033 * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
3034 * that has happened.
Chris Masone02119d2008-09-05 16:13:11 -04003035 */
3036int btrfs_sync_log(struct btrfs_trans_handle *trans,
Miao Xie8b050d32014-02-20 18:08:58 +08003037 struct btrfs_root *root, struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -04003038{
Yan Zheng7237f182009-01-21 12:54:03 -05003039 int index1;
3040 int index2;
Yan, Zheng8cef4e12009-11-12 09:33:26 +00003041 int mark;
Chris Masone02119d2008-09-05 16:13:11 -04003042 int ret;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003043 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04003044 struct btrfs_root *log = root->log_root;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003045 struct btrfs_root *log_root_tree = fs_info->log_root_tree;
Miao Xiebb14a592014-02-20 18:08:56 +08003046 int log_transid = 0;
Miao Xie8b050d32014-02-20 18:08:58 +08003047 struct btrfs_log_ctx root_log_ctx;
Miao Xiec6adc9c2013-05-28 10:05:39 +00003048 struct blk_plug plug;
Chris Masone02119d2008-09-05 16:13:11 -04003049
Yan Zheng7237f182009-01-21 12:54:03 -05003050 mutex_lock(&root->log_mutex);
Miao Xied1433de2014-02-20 18:08:59 +08003051 log_transid = ctx->log_transid;
3052 if (root->log_transid_committed >= log_transid) {
Yan Zheng7237f182009-01-21 12:54:03 -05003053 mutex_unlock(&root->log_mutex);
Miao Xie8b050d32014-02-20 18:08:58 +08003054 return ctx->log_ret;
Chris Masone02119d2008-09-05 16:13:11 -04003055 }
Miao Xied1433de2014-02-20 18:08:59 +08003056
3057 index1 = log_transid % 2;
3058 if (atomic_read(&root->log_commit[index1])) {
Zhaolei60d53eb2015-08-17 18:44:46 +08003059 wait_log_commit(root, log_transid);
Miao Xied1433de2014-02-20 18:08:59 +08003060 mutex_unlock(&root->log_mutex);
3061 return ctx->log_ret;
3062 }
3063 ASSERT(log_transid == root->log_transid);
Yan Zheng7237f182009-01-21 12:54:03 -05003064 atomic_set(&root->log_commit[index1], 1);
3065
3066 /* wait for previous tree log sync to complete */
3067 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
Zhaolei60d53eb2015-08-17 18:44:46 +08003068 wait_log_commit(root, log_transid - 1);
Miao Xie48cab2e2014-02-20 18:08:52 +08003069
Yan, Zheng86df7eb2009-10-14 09:24:59 -04003070 while (1) {
Miao Xie2ecb7922012-09-06 04:04:27 -06003071 int batch = atomic_read(&root->log_batch);
Chris Masoncd354ad2011-10-20 15:45:37 -04003072 /* when we're on an ssd, just kick the log commit out */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003073 if (!btrfs_test_opt(fs_info, SSD) &&
Miao Xie27cdeb72014-04-02 19:51:05 +08003074 test_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state)) {
Yan, Zheng86df7eb2009-10-14 09:24:59 -04003075 mutex_unlock(&root->log_mutex);
3076 schedule_timeout_uninterruptible(1);
3077 mutex_lock(&root->log_mutex);
3078 }
Zhaolei60d53eb2015-08-17 18:44:46 +08003079 wait_for_writer(root);
Miao Xie2ecb7922012-09-06 04:04:27 -06003080 if (batch == atomic_read(&root->log_batch))
Chris Masone02119d2008-09-05 16:13:11 -04003081 break;
3082 }
Chris Masond0c803c2008-09-11 16:17:57 -04003083
Chris Mason12fcfd22009-03-24 10:24:20 -04003084 /* bail out if we need to do a full commit */
David Sterba4884b8e2019-03-20 13:25:34 +01003085 if (btrfs_need_log_full_commit(trans)) {
Chris Mason12fcfd22009-03-24 10:24:20 -04003086 ret = -EAGAIN;
3087 mutex_unlock(&root->log_mutex);
3088 goto out;
3089 }
3090
Yan, Zheng8cef4e12009-11-12 09:33:26 +00003091 if (log_transid % 2 == 0)
3092 mark = EXTENT_DIRTY;
3093 else
3094 mark = EXTENT_NEW;
3095
Chris Mason690587d2009-10-13 13:29:19 -04003096 /* we start IO on all the marked extents here, but we don't actually
3097 * wait for them until later.
3098 */
Miao Xiec6adc9c2013-05-28 10:05:39 +00003099 blk_start_plug(&plug);
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003100 ret = btrfs_write_marked_extents(fs_info, &log->dirty_log_pages, mark);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003101 if (ret) {
Miao Xiec6adc9c2013-05-28 10:05:39 +00003102 blk_finish_plug(&plug);
Jeff Mahoney66642832016-06-10 18:19:25 -04003103 btrfs_abort_transaction(trans, ret);
David Sterba90787762019-03-20 13:28:05 +01003104 btrfs_set_log_full_commit(trans);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003105 mutex_unlock(&root->log_mutex);
3106 goto out;
3107 }
Yan Zheng7237f182009-01-21 12:54:03 -05003108
Yan Zheng5d4f98a2009-06-10 10:45:14 -04003109 btrfs_set_root_node(&log->root_item, log->node);
Yan Zheng7237f182009-01-21 12:54:03 -05003110
Yan Zheng7237f182009-01-21 12:54:03 -05003111 root->log_transid++;
3112 log->log_transid = root->log_transid;
Josef Bacikff782e02009-10-08 15:30:04 -04003113 root->log_start_pid = 0;
Yan Zheng7237f182009-01-21 12:54:03 -05003114 /*
Yan, Zheng8cef4e12009-11-12 09:33:26 +00003115 * IO has been started, blocks of the log tree have WRITTEN flag set
3116 * in their headers. new modifications of the log will be written to
3117 * new positions. so it's safe to allow log writers to go in.
Yan Zheng7237f182009-01-21 12:54:03 -05003118 */
3119 mutex_unlock(&root->log_mutex);
3120
Filipe Manana28a23592016-08-23 21:13:51 +01003121 btrfs_init_log_ctx(&root_log_ctx, NULL);
Miao Xied1433de2014-02-20 18:08:59 +08003122
Yan Zheng7237f182009-01-21 12:54:03 -05003123 mutex_lock(&log_root_tree->log_mutex);
Miao Xie2ecb7922012-09-06 04:04:27 -06003124 atomic_inc(&log_root_tree->log_batch);
Yan Zheng7237f182009-01-21 12:54:03 -05003125 atomic_inc(&log_root_tree->log_writers);
Miao Xied1433de2014-02-20 18:08:59 +08003126
3127 index2 = log_root_tree->log_transid % 2;
3128 list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
3129 root_log_ctx.log_transid = log_root_tree->log_transid;
3130
Yan Zheng7237f182009-01-21 12:54:03 -05003131 mutex_unlock(&log_root_tree->log_mutex);
3132
3133 ret = update_log_root(trans, log);
Yan Zheng7237f182009-01-21 12:54:03 -05003134
3135 mutex_lock(&log_root_tree->log_mutex);
3136 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
David Sterba093258e2018-02-26 16:15:17 +01003137 /* atomic_dec_and_test implies a barrier */
3138 cond_wake_up_nomb(&log_root_tree->log_writer_wait);
Yan Zheng7237f182009-01-21 12:54:03 -05003139 }
3140
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003141 if (ret) {
Miao Xied1433de2014-02-20 18:08:59 +08003142 if (!list_empty(&root_log_ctx.list))
3143 list_del_init(&root_log_ctx.list);
3144
Miao Xiec6adc9c2013-05-28 10:05:39 +00003145 blk_finish_plug(&plug);
David Sterba90787762019-03-20 13:28:05 +01003146 btrfs_set_log_full_commit(trans);
Miao Xie995946d2014-04-02 19:51:06 +08003147
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003148 if (ret != -ENOSPC) {
Jeff Mahoney66642832016-06-10 18:19:25 -04003149 btrfs_abort_transaction(trans, ret);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003150 mutex_unlock(&log_root_tree->log_mutex);
3151 goto out;
3152 }
Jeff Mahoneybf89d382016-09-09 20:42:44 -04003153 btrfs_wait_tree_log_extents(log, mark);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003154 mutex_unlock(&log_root_tree->log_mutex);
3155 ret = -EAGAIN;
3156 goto out;
3157 }
3158
Miao Xied1433de2014-02-20 18:08:59 +08003159 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
Forrest Liu3da5ab52015-01-30 19:42:12 +08003160 blk_finish_plug(&plug);
Chris Masoncbd60aa2016-09-06 05:37:40 -07003161 list_del_init(&root_log_ctx.list);
Miao Xied1433de2014-02-20 18:08:59 +08003162 mutex_unlock(&log_root_tree->log_mutex);
3163 ret = root_log_ctx.log_ret;
3164 goto out;
3165 }
Miao Xie8b050d32014-02-20 18:08:58 +08003166
Miao Xied1433de2014-02-20 18:08:59 +08003167 index2 = root_log_ctx.log_transid % 2;
Yan Zheng7237f182009-01-21 12:54:03 -05003168 if (atomic_read(&log_root_tree->log_commit[index2])) {
Miao Xiec6adc9c2013-05-28 10:05:39 +00003169 blk_finish_plug(&plug);
Jeff Mahoneybf89d382016-09-09 20:42:44 -04003170 ret = btrfs_wait_tree_log_extents(log, mark);
Zhaolei60d53eb2015-08-17 18:44:46 +08003171 wait_log_commit(log_root_tree,
Miao Xied1433de2014-02-20 18:08:59 +08003172 root_log_ctx.log_transid);
Yan Zheng7237f182009-01-21 12:54:03 -05003173 mutex_unlock(&log_root_tree->log_mutex);
Filipe Manana5ab5e442014-11-13 16:59:53 +00003174 if (!ret)
3175 ret = root_log_ctx.log_ret;
Yan Zheng7237f182009-01-21 12:54:03 -05003176 goto out;
3177 }
Miao Xied1433de2014-02-20 18:08:59 +08003178 ASSERT(root_log_ctx.log_transid == log_root_tree->log_transid);
Yan Zheng7237f182009-01-21 12:54:03 -05003179 atomic_set(&log_root_tree->log_commit[index2], 1);
3180
Chris Mason12fcfd22009-03-24 10:24:20 -04003181 if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
Zhaolei60d53eb2015-08-17 18:44:46 +08003182 wait_log_commit(log_root_tree,
Miao Xied1433de2014-02-20 18:08:59 +08003183 root_log_ctx.log_transid - 1);
Chris Mason12fcfd22009-03-24 10:24:20 -04003184 }
Yan Zheng7237f182009-01-21 12:54:03 -05003185
Zhaolei60d53eb2015-08-17 18:44:46 +08003186 wait_for_writer(log_root_tree);
Chris Mason12fcfd22009-03-24 10:24:20 -04003187
3188 /*
3189 * now that we've moved on to the tree of log tree roots,
3190 * check the full commit flag again
3191 */
David Sterba4884b8e2019-03-20 13:25:34 +01003192 if (btrfs_need_log_full_commit(trans)) {
Miao Xiec6adc9c2013-05-28 10:05:39 +00003193 blk_finish_plug(&plug);
Jeff Mahoneybf89d382016-09-09 20:42:44 -04003194 btrfs_wait_tree_log_extents(log, mark);
Chris Mason12fcfd22009-03-24 10:24:20 -04003195 mutex_unlock(&log_root_tree->log_mutex);
3196 ret = -EAGAIN;
3197 goto out_wake_log_root;
3198 }
Yan Zheng7237f182009-01-21 12:54:03 -05003199
Jeff Mahoney2ff7e612016-06-22 18:54:24 -04003200 ret = btrfs_write_marked_extents(fs_info,
Miao Xiec6adc9c2013-05-28 10:05:39 +00003201 &log_root_tree->dirty_log_pages,
3202 EXTENT_DIRTY | EXTENT_NEW);
3203 blk_finish_plug(&plug);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003204 if (ret) {
David Sterba90787762019-03-20 13:28:05 +01003205 btrfs_set_log_full_commit(trans);
Jeff Mahoney66642832016-06-10 18:19:25 -04003206 btrfs_abort_transaction(trans, ret);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003207 mutex_unlock(&log_root_tree->log_mutex);
3208 goto out_wake_log_root;
3209 }
Jeff Mahoneybf89d382016-09-09 20:42:44 -04003210 ret = btrfs_wait_tree_log_extents(log, mark);
Filipe Manana5ab5e442014-11-13 16:59:53 +00003211 if (!ret)
Jeff Mahoneybf89d382016-09-09 20:42:44 -04003212 ret = btrfs_wait_tree_log_extents(log_root_tree,
3213 EXTENT_NEW | EXTENT_DIRTY);
Filipe Manana5ab5e442014-11-13 16:59:53 +00003214 if (ret) {
David Sterba90787762019-03-20 13:28:05 +01003215 btrfs_set_log_full_commit(trans);
Filipe Manana5ab5e442014-11-13 16:59:53 +00003216 mutex_unlock(&log_root_tree->log_mutex);
3217 goto out_wake_log_root;
3218 }
Chris Masone02119d2008-09-05 16:13:11 -04003219
Jeff Mahoney0b246af2016-06-22 18:54:23 -04003220 btrfs_set_super_log_root(fs_info->super_for_commit,
3221 log_root_tree->node->start);
3222 btrfs_set_super_log_root_level(fs_info->super_for_commit,
3223 btrfs_header_level(log_root_tree->node));
Chris Masone02119d2008-09-05 16:13:11 -04003224
Yan Zheng7237f182009-01-21 12:54:03 -05003225 log_root_tree->log_transid++;
Yan Zheng7237f182009-01-21 12:54:03 -05003226 mutex_unlock(&log_root_tree->log_mutex);
3227
3228 /*
Andrea Gelmini52042d82018-11-28 12:05:13 +01003229 * Nobody else is going to jump in and write the ctree
Yan Zheng7237f182009-01-21 12:54:03 -05003230 * super here because the log_commit atomic below is protecting
3231 * us. We must be called with a transaction handle pinning
3232 * the running transaction open, so a full commit can't hop
3233 * in and cause problems either.
3234 */
David Sterbaeece6a92017-02-10 19:04:32 +01003235 ret = write_all_supers(fs_info, 1);
Stefan Behrens5af3e8c2012-08-01 18:56:49 +02003236 if (ret) {
David Sterba90787762019-03-20 13:28:05 +01003237 btrfs_set_log_full_commit(trans);
Jeff Mahoney66642832016-06-10 18:19:25 -04003238 btrfs_abort_transaction(trans, ret);
Stefan Behrens5af3e8c2012-08-01 18:56:49 +02003239 goto out_wake_log_root;
3240 }
Yan Zheng7237f182009-01-21 12:54:03 -05003241
Chris Mason257c62e2009-10-13 13:21:08 -04003242 mutex_lock(&root->log_mutex);
3243 if (root->last_log_commit < log_transid)
3244 root->last_log_commit = log_transid;
3245 mutex_unlock(&root->log_mutex);
3246
Chris Mason12fcfd22009-03-24 10:24:20 -04003247out_wake_log_root:
Chris Mason570dd452016-10-27 10:42:20 -07003248 mutex_lock(&log_root_tree->log_mutex);
Miao Xie8b050d32014-02-20 18:08:58 +08003249 btrfs_remove_all_log_ctxs(log_root_tree, index2, ret);
3250
Miao Xied1433de2014-02-20 18:08:59 +08003251 log_root_tree->log_transid_committed++;
Yan Zheng7237f182009-01-21 12:54:03 -05003252 atomic_set(&log_root_tree->log_commit[index2], 0);
Miao Xied1433de2014-02-20 18:08:59 +08003253 mutex_unlock(&log_root_tree->log_mutex);
3254
David Sterba33a9eca2015-10-10 18:35:10 +02003255 /*
David Sterba093258e2018-02-26 16:15:17 +01003256 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3257 * all the updates above are seen by the woken threads. It might not be
3258 * necessary, but proving that seems to be hard.
David Sterba33a9eca2015-10-10 18:35:10 +02003259 */
David Sterba093258e2018-02-26 16:15:17 +01003260 cond_wake_up(&log_root_tree->log_commit_wait[index2]);
Chris Masone02119d2008-09-05 16:13:11 -04003261out:
Miao Xied1433de2014-02-20 18:08:59 +08003262 mutex_lock(&root->log_mutex);
Chris Mason570dd452016-10-27 10:42:20 -07003263 btrfs_remove_all_log_ctxs(root, index1, ret);
Miao Xied1433de2014-02-20 18:08:59 +08003264 root->log_transid_committed++;
Yan Zheng7237f182009-01-21 12:54:03 -05003265 atomic_set(&root->log_commit[index1], 0);
Miao Xied1433de2014-02-20 18:08:59 +08003266 mutex_unlock(&root->log_mutex);
Miao Xie8b050d32014-02-20 18:08:58 +08003267
David Sterba33a9eca2015-10-10 18:35:10 +02003268 /*
David Sterba093258e2018-02-26 16:15:17 +01003269 * The barrier before waitqueue_active (in cond_wake_up) is needed so
3270 * all the updates above are seen by the woken threads. It might not be
3271 * necessary, but proving that seems to be hard.
David Sterba33a9eca2015-10-10 18:35:10 +02003272 */
David Sterba093258e2018-02-26 16:15:17 +01003273 cond_wake_up(&root->log_commit_wait[index1]);
Chris Masonb31eabd2011-01-31 16:48:24 -05003274 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04003275}
3276
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003277static void free_log_tree(struct btrfs_trans_handle *trans,
3278 struct btrfs_root *log)
Chris Masone02119d2008-09-05 16:13:11 -04003279{
3280 int ret;
Chris Masone02119d2008-09-05 16:13:11 -04003281 struct walk_control wc = {
3282 .free = 1,
3283 .process_func = process_one_buffer
3284 };
3285
Josef Bacik681ae502013-10-07 15:11:00 -04003286 ret = walk_log_tree(trans, log, &wc);
Jeff Mahoney374b0e22018-09-06 16:59:33 -04003287 if (ret) {
3288 if (trans)
3289 btrfs_abort_transaction(trans, ret);
3290 else
3291 btrfs_handle_fs_error(log->fs_info, ret, NULL);
3292 }
Chris Masone02119d2008-09-05 16:13:11 -04003293
Filipe Manana59b07132018-11-09 10:43:08 +00003294 clear_extent_bits(&log->dirty_log_pages, 0, (u64)-1,
3295 EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT);
Yan Zheng7237f182009-01-21 12:54:03 -05003296 free_extent_buffer(log->node);
3297 kfree(log);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003298}
3299
3300/*
3301 * free all the extents used by the tree log. This should be called
3302 * at commit time of the full transaction
3303 */
3304int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
3305{
3306 if (root->log_root) {
3307 free_log_tree(trans, root->log_root);
3308 root->log_root = NULL;
3309 }
3310 return 0;
3311}
3312
3313int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
3314 struct btrfs_fs_info *fs_info)
3315{
3316 if (fs_info->log_root_tree) {
3317 free_log_tree(trans, fs_info->log_root_tree);
3318 fs_info->log_root_tree = NULL;
3319 }
Chris Masone02119d2008-09-05 16:13:11 -04003320 return 0;
3321}
3322
3323/*
Chris Masone02119d2008-09-05 16:13:11 -04003324 * If both a file and directory are logged, and unlinks or renames are
3325 * mixed in, we have a few interesting corners:
3326 *
3327 * create file X in dir Y
3328 * link file X to X.link in dir Y
3329 * fsync file X
3330 * unlink file X but leave X.link
3331 * fsync dir Y
3332 *
3333 * After a crash we would expect only X.link to exist. But file X
3334 * didn't get fsync'd again so the log has back refs for X and X.link.
3335 *
3336 * We solve this by removing directory entries and inode backrefs from the
3337 * log when a file that was logged in the current transaction is
3338 * unlinked. Any later fsync will include the updated log entries, and
3339 * we'll be able to reconstruct the proper directory items from backrefs.
3340 *
3341 * This optimizations allows us to avoid relogging the entire inode
3342 * or the entire directory.
3343 */
3344int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
3345 struct btrfs_root *root,
3346 const char *name, int name_len,
Nikolay Borisov49f34d12017-01-18 00:31:32 +02003347 struct btrfs_inode *dir, u64 index)
Chris Masone02119d2008-09-05 16:13:11 -04003348{
3349 struct btrfs_root *log;
3350 struct btrfs_dir_item *di;
3351 struct btrfs_path *path;
3352 int ret;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003353 int err = 0;
Chris Masone02119d2008-09-05 16:13:11 -04003354 int bytes_del = 0;
Nikolay Borisov49f34d12017-01-18 00:31:32 +02003355 u64 dir_ino = btrfs_ino(dir);
Chris Masone02119d2008-09-05 16:13:11 -04003356
Nikolay Borisov49f34d12017-01-18 00:31:32 +02003357 if (dir->logged_trans < trans->transid)
Chris Mason3a5f1d42008-09-11 15:53:37 -04003358 return 0;
3359
Chris Masone02119d2008-09-05 16:13:11 -04003360 ret = join_running_log_trans(root);
3361 if (ret)
3362 return 0;
3363
Nikolay Borisov49f34d12017-01-18 00:31:32 +02003364 mutex_lock(&dir->log_mutex);
Chris Masone02119d2008-09-05 16:13:11 -04003365
3366 log = root->log_root;
3367 path = btrfs_alloc_path();
Tsutomu Itoha62f44a2011-04-25 19:43:51 -04003368 if (!path) {
3369 err = -ENOMEM;
3370 goto out_unlock;
3371 }
liubo2a29edc2011-01-26 06:22:08 +00003372
Li Zefan33345d012011-04-20 10:31:50 +08003373 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
Chris Masone02119d2008-09-05 16:13:11 -04003374 name, name_len, -1);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003375 if (IS_ERR(di)) {
3376 err = PTR_ERR(di);
3377 goto fail;
3378 }
3379 if (di) {
Chris Masone02119d2008-09-05 16:13:11 -04003380 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3381 bytes_del += name_len;
Josef Bacik36508602013-04-25 16:23:32 -04003382 if (ret) {
3383 err = ret;
3384 goto fail;
3385 }
Chris Masone02119d2008-09-05 16:13:11 -04003386 }
David Sterbab3b4aa72011-04-21 01:20:15 +02003387 btrfs_release_path(path);
Li Zefan33345d012011-04-20 10:31:50 +08003388 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
Chris Masone02119d2008-09-05 16:13:11 -04003389 index, name, name_len, -1);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003390 if (IS_ERR(di)) {
3391 err = PTR_ERR(di);
3392 goto fail;
3393 }
3394 if (di) {
Chris Masone02119d2008-09-05 16:13:11 -04003395 ret = btrfs_delete_one_dir_name(trans, log, path, di);
3396 bytes_del += name_len;
Josef Bacik36508602013-04-25 16:23:32 -04003397 if (ret) {
3398 err = ret;
3399 goto fail;
3400 }
Chris Masone02119d2008-09-05 16:13:11 -04003401 }
3402
3403 /* update the directory size in the log to reflect the names
3404 * we have removed
3405 */
3406 if (bytes_del) {
3407 struct btrfs_key key;
3408
Li Zefan33345d012011-04-20 10:31:50 +08003409 key.objectid = dir_ino;
Chris Masone02119d2008-09-05 16:13:11 -04003410 key.offset = 0;
3411 key.type = BTRFS_INODE_ITEM_KEY;
David Sterbab3b4aa72011-04-21 01:20:15 +02003412 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003413
3414 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003415 if (ret < 0) {
3416 err = ret;
3417 goto fail;
3418 }
Chris Masone02119d2008-09-05 16:13:11 -04003419 if (ret == 0) {
3420 struct btrfs_inode_item *item;
3421 u64 i_size;
3422
3423 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3424 struct btrfs_inode_item);
3425 i_size = btrfs_inode_size(path->nodes[0], item);
3426 if (i_size > bytes_del)
3427 i_size -= bytes_del;
3428 else
3429 i_size = 0;
3430 btrfs_set_inode_size(path->nodes[0], item, i_size);
3431 btrfs_mark_buffer_dirty(path->nodes[0]);
3432 } else
3433 ret = 0;
David Sterbab3b4aa72011-04-21 01:20:15 +02003434 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003435 }
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003436fail:
Chris Masone02119d2008-09-05 16:13:11 -04003437 btrfs_free_path(path);
Tsutomu Itoha62f44a2011-04-25 19:43:51 -04003438out_unlock:
Nikolay Borisov49f34d12017-01-18 00:31:32 +02003439 mutex_unlock(&dir->log_mutex);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003440 if (ret == -ENOSPC) {
David Sterba90787762019-03-20 13:28:05 +01003441 btrfs_set_log_full_commit(trans);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003442 ret = 0;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003443 } else if (ret < 0)
Jeff Mahoney66642832016-06-10 18:19:25 -04003444 btrfs_abort_transaction(trans, ret);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003445
Chris Mason12fcfd22009-03-24 10:24:20 -04003446 btrfs_end_log_trans(root);
Chris Masone02119d2008-09-05 16:13:11 -04003447
Andi Kleen411fc6b2010-10-29 15:14:31 -04003448 return err;
Chris Masone02119d2008-09-05 16:13:11 -04003449}
3450
3451/* see comments for btrfs_del_dir_entries_in_log */
3452int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
3453 struct btrfs_root *root,
3454 const char *name, int name_len,
Nikolay Borisova491abb2017-01-18 00:31:33 +02003455 struct btrfs_inode *inode, u64 dirid)
Chris Masone02119d2008-09-05 16:13:11 -04003456{
3457 struct btrfs_root *log;
3458 u64 index;
3459 int ret;
3460
Nikolay Borisova491abb2017-01-18 00:31:33 +02003461 if (inode->logged_trans < trans->transid)
Chris Mason3a5f1d42008-09-11 15:53:37 -04003462 return 0;
3463
Chris Masone02119d2008-09-05 16:13:11 -04003464 ret = join_running_log_trans(root);
3465 if (ret)
3466 return 0;
3467 log = root->log_root;
Nikolay Borisova491abb2017-01-18 00:31:33 +02003468 mutex_lock(&inode->log_mutex);
Chris Masone02119d2008-09-05 16:13:11 -04003469
Nikolay Borisova491abb2017-01-18 00:31:33 +02003470 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
Chris Masone02119d2008-09-05 16:13:11 -04003471 dirid, &index);
Nikolay Borisova491abb2017-01-18 00:31:33 +02003472 mutex_unlock(&inode->log_mutex);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003473 if (ret == -ENOSPC) {
David Sterba90787762019-03-20 13:28:05 +01003474 btrfs_set_log_full_commit(trans);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003475 ret = 0;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01003476 } else if (ret < 0 && ret != -ENOENT)
Jeff Mahoney66642832016-06-10 18:19:25 -04003477 btrfs_abort_transaction(trans, ret);
Chris Mason12fcfd22009-03-24 10:24:20 -04003478 btrfs_end_log_trans(root);
Chris Masone02119d2008-09-05 16:13:11 -04003479
Chris Masone02119d2008-09-05 16:13:11 -04003480 return ret;
3481}
3482
3483/*
3484 * creates a range item in the log for 'dirid'. first_offset and
3485 * last_offset tell us which parts of the key space the log should
3486 * be considered authoritative for.
3487 */
3488static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
3489 struct btrfs_root *log,
3490 struct btrfs_path *path,
3491 int key_type, u64 dirid,
3492 u64 first_offset, u64 last_offset)
3493{
3494 int ret;
3495 struct btrfs_key key;
3496 struct btrfs_dir_log_item *item;
3497
3498 key.objectid = dirid;
3499 key.offset = first_offset;
3500 if (key_type == BTRFS_DIR_ITEM_KEY)
3501 key.type = BTRFS_DIR_LOG_ITEM_KEY;
3502 else
3503 key.type = BTRFS_DIR_LOG_INDEX_KEY;
3504 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003505 if (ret)
3506 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04003507
3508 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3509 struct btrfs_dir_log_item);
3510 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
3511 btrfs_mark_buffer_dirty(path->nodes[0]);
David Sterbab3b4aa72011-04-21 01:20:15 +02003512 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003513 return 0;
3514}
3515
3516/*
3517 * log all the items included in the current transaction for a given
3518 * directory. This also creates the range items in the log tree required
3519 * to replay anything deleted before the fsync
3520 */
3521static noinline int log_dir_items(struct btrfs_trans_handle *trans,
Nikolay Borisov684a5772017-01-18 00:31:41 +02003522 struct btrfs_root *root, struct btrfs_inode *inode,
Chris Masone02119d2008-09-05 16:13:11 -04003523 struct btrfs_path *path,
3524 struct btrfs_path *dst_path, int key_type,
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00003525 struct btrfs_log_ctx *ctx,
Chris Masone02119d2008-09-05 16:13:11 -04003526 u64 min_offset, u64 *last_offset_ret)
3527{
3528 struct btrfs_key min_key;
Chris Masone02119d2008-09-05 16:13:11 -04003529 struct btrfs_root *log = root->log_root;
3530 struct extent_buffer *src;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003531 int err = 0;
Chris Masone02119d2008-09-05 16:13:11 -04003532 int ret;
3533 int i;
3534 int nritems;
3535 u64 first_offset = min_offset;
3536 u64 last_offset = (u64)-1;
Nikolay Borisov684a5772017-01-18 00:31:41 +02003537 u64 ino = btrfs_ino(inode);
Chris Masone02119d2008-09-05 16:13:11 -04003538
3539 log = root->log_root;
Chris Masone02119d2008-09-05 16:13:11 -04003540
Li Zefan33345d012011-04-20 10:31:50 +08003541 min_key.objectid = ino;
Chris Masone02119d2008-09-05 16:13:11 -04003542 min_key.type = key_type;
3543 min_key.offset = min_offset;
3544
Filipe David Borba Manana6174d3c2013-10-01 16:13:42 +01003545 ret = btrfs_search_forward(root, &min_key, path, trans->transid);
Chris Masone02119d2008-09-05 16:13:11 -04003546
3547 /*
3548 * we didn't find anything from this transaction, see if there
3549 * is anything at all
3550 */
Li Zefan33345d012011-04-20 10:31:50 +08003551 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
3552 min_key.objectid = ino;
Chris Masone02119d2008-09-05 16:13:11 -04003553 min_key.type = key_type;
3554 min_key.offset = (u64)-1;
David Sterbab3b4aa72011-04-21 01:20:15 +02003555 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003556 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
3557 if (ret < 0) {
David Sterbab3b4aa72011-04-21 01:20:15 +02003558 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003559 return ret;
3560 }
Li Zefan33345d012011-04-20 10:31:50 +08003561 ret = btrfs_previous_item(root, path, ino, key_type);
Chris Masone02119d2008-09-05 16:13:11 -04003562
3563 /* if ret == 0 there are items for this type,
3564 * create a range to tell us the last key of this type.
3565 * otherwise, there are no items in this directory after
3566 * *min_offset, and we create a range to indicate that.
3567 */
3568 if (ret == 0) {
3569 struct btrfs_key tmp;
3570 btrfs_item_key_to_cpu(path->nodes[0], &tmp,
3571 path->slots[0]);
Chris Masond3977122009-01-05 21:25:51 -05003572 if (key_type == tmp.type)
Chris Masone02119d2008-09-05 16:13:11 -04003573 first_offset = max(min_offset, tmp.offset) + 1;
Chris Masone02119d2008-09-05 16:13:11 -04003574 }
3575 goto done;
3576 }
3577
3578 /* go backward to find any previous key */
Li Zefan33345d012011-04-20 10:31:50 +08003579 ret = btrfs_previous_item(root, path, ino, key_type);
Chris Masone02119d2008-09-05 16:13:11 -04003580 if (ret == 0) {
3581 struct btrfs_key tmp;
3582 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
3583 if (key_type == tmp.type) {
3584 first_offset = tmp.offset;
3585 ret = overwrite_item(trans, log, dst_path,
3586 path->nodes[0], path->slots[0],
3587 &tmp);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003588 if (ret) {
3589 err = ret;
3590 goto done;
3591 }
Chris Masone02119d2008-09-05 16:13:11 -04003592 }
3593 }
David Sterbab3b4aa72011-04-21 01:20:15 +02003594 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003595
Josef Bacik2cc83342019-03-06 17:13:04 -05003596 /*
3597 * Find the first key from this transaction again. See the note for
3598 * log_new_dir_dentries, if we're logging a directory recursively we
3599 * won't be holding its i_mutex, which means we can modify the directory
3600 * while we're logging it. If we remove an entry between our first
3601 * search and this search we'll not find the key again and can just
3602 * bail.
3603 */
Chris Masone02119d2008-09-05 16:13:11 -04003604 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
Josef Bacik2cc83342019-03-06 17:13:04 -05003605 if (ret != 0)
Chris Masone02119d2008-09-05 16:13:11 -04003606 goto done;
Chris Masone02119d2008-09-05 16:13:11 -04003607
3608 /*
3609 * we have a block from this transaction, log every item in it
3610 * from our directory
3611 */
Chris Masond3977122009-01-05 21:25:51 -05003612 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04003613 struct btrfs_key tmp;
3614 src = path->nodes[0];
3615 nritems = btrfs_header_nritems(src);
3616 for (i = path->slots[0]; i < nritems; i++) {
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00003617 struct btrfs_dir_item *di;
3618
Chris Masone02119d2008-09-05 16:13:11 -04003619 btrfs_item_key_to_cpu(src, &min_key, i);
3620
Li Zefan33345d012011-04-20 10:31:50 +08003621 if (min_key.objectid != ino || min_key.type != key_type)
Chris Masone02119d2008-09-05 16:13:11 -04003622 goto done;
3623 ret = overwrite_item(trans, log, dst_path, src, i,
3624 &min_key);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003625 if (ret) {
3626 err = ret;
3627 goto done;
3628 }
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00003629
3630 /*
3631 * We must make sure that when we log a directory entry,
3632 * the corresponding inode, after log replay, has a
3633 * matching link count. For example:
3634 *
3635 * touch foo
3636 * mkdir mydir
3637 * sync
3638 * ln foo mydir/bar
3639 * xfs_io -c "fsync" mydir
3640 * <crash>
3641 * <mount fs and log replay>
3642 *
3643 * Would result in a fsync log that when replayed, our
3644 * file inode would have a link count of 1, but we get
3645 * two directory entries pointing to the same inode.
3646 * After removing one of the names, it would not be
3647 * possible to remove the other name, which resulted
3648 * always in stale file handle errors, and would not
3649 * be possible to rmdir the parent directory, since
3650 * its i_size could never decrement to the value
3651 * BTRFS_EMPTY_DIR_SIZE, resulting in -ENOTEMPTY errors.
3652 */
3653 di = btrfs_item_ptr(src, i, struct btrfs_dir_item);
3654 btrfs_dir_item_key_to_cpu(src, di, &tmp);
3655 if (ctx &&
3656 (btrfs_dir_transid(src, di) == trans->transid ||
3657 btrfs_dir_type(src, di) == BTRFS_FT_DIR) &&
3658 tmp.type != BTRFS_ROOT_ITEM_KEY)
3659 ctx->log_new_dentries = true;
Chris Masone02119d2008-09-05 16:13:11 -04003660 }
3661 path->slots[0] = nritems;
3662
3663 /*
3664 * look ahead to the next item and see if it is also
3665 * from this directory and from this transaction
3666 */
3667 ret = btrfs_next_leaf(root, path);
Liu Bo80c0b422018-04-03 01:59:47 +08003668 if (ret) {
3669 if (ret == 1)
3670 last_offset = (u64)-1;
3671 else
3672 err = ret;
Chris Masone02119d2008-09-05 16:13:11 -04003673 goto done;
3674 }
3675 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
Li Zefan33345d012011-04-20 10:31:50 +08003676 if (tmp.objectid != ino || tmp.type != key_type) {
Chris Masone02119d2008-09-05 16:13:11 -04003677 last_offset = (u64)-1;
3678 goto done;
3679 }
3680 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
3681 ret = overwrite_item(trans, log, dst_path,
3682 path->nodes[0], path->slots[0],
3683 &tmp);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003684 if (ret)
3685 err = ret;
3686 else
3687 last_offset = tmp.offset;
Chris Masone02119d2008-09-05 16:13:11 -04003688 goto done;
3689 }
3690 }
3691done:
David Sterbab3b4aa72011-04-21 01:20:15 +02003692 btrfs_release_path(path);
3693 btrfs_release_path(dst_path);
Chris Masone02119d2008-09-05 16:13:11 -04003694
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003695 if (err == 0) {
3696 *last_offset_ret = last_offset;
3697 /*
3698 * insert the log range keys to indicate where the log
3699 * is valid
3700 */
3701 ret = insert_dir_log_key(trans, log, path, key_type,
Li Zefan33345d012011-04-20 10:31:50 +08003702 ino, first_offset, last_offset);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003703 if (ret)
3704 err = ret;
3705 }
3706 return err;
Chris Masone02119d2008-09-05 16:13:11 -04003707}
3708
3709/*
3710 * logging directories is very similar to logging inodes, We find all the items
3711 * from the current transaction and write them to the log.
3712 *
3713 * The recovery code scans the directory in the subvolume, and if it finds a
3714 * key in the range logged that is not present in the log tree, then it means
3715 * that dir entry was unlinked during the transaction.
3716 *
3717 * In order for that scan to work, we must include one key smaller than
3718 * the smallest logged by this transaction and one key larger than the largest
3719 * key logged by this transaction.
3720 */
3721static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
Nikolay Borisovdbf39ea2017-01-18 00:31:42 +02003722 struct btrfs_root *root, struct btrfs_inode *inode,
Chris Masone02119d2008-09-05 16:13:11 -04003723 struct btrfs_path *path,
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00003724 struct btrfs_path *dst_path,
3725 struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -04003726{
3727 u64 min_key;
3728 u64 max_key;
3729 int ret;
3730 int key_type = BTRFS_DIR_ITEM_KEY;
3731
3732again:
3733 min_key = 0;
3734 max_key = 0;
Chris Masond3977122009-01-05 21:25:51 -05003735 while (1) {
Nikolay Borisovdbf39ea2017-01-18 00:31:42 +02003736 ret = log_dir_items(trans, root, inode, path, dst_path, key_type,
3737 ctx, min_key, &max_key);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003738 if (ret)
3739 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04003740 if (max_key == (u64)-1)
3741 break;
3742 min_key = max_key + 1;
3743 }
3744
3745 if (key_type == BTRFS_DIR_ITEM_KEY) {
3746 key_type = BTRFS_DIR_INDEX_KEY;
3747 goto again;
3748 }
3749 return 0;
3750}
3751
3752/*
3753 * a helper function to drop items from the log before we relog an
3754 * inode. max_key_type indicates the highest item type to remove.
3755 * This cannot be run for file data extents because it does not
3756 * free the extents they point to.
3757 */
3758static int drop_objectid_items(struct btrfs_trans_handle *trans,
3759 struct btrfs_root *log,
3760 struct btrfs_path *path,
3761 u64 objectid, int max_key_type)
3762{
3763 int ret;
3764 struct btrfs_key key;
3765 struct btrfs_key found_key;
Josef Bacik18ec90d2012-09-28 11:56:28 -04003766 int start_slot;
Chris Masone02119d2008-09-05 16:13:11 -04003767
3768 key.objectid = objectid;
3769 key.type = max_key_type;
3770 key.offset = (u64)-1;
3771
Chris Masond3977122009-01-05 21:25:51 -05003772 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04003773 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
Josef Bacik36508602013-04-25 16:23:32 -04003774 BUG_ON(ret == 0); /* Logic error */
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003775 if (ret < 0)
Chris Masone02119d2008-09-05 16:13:11 -04003776 break;
3777
3778 if (path->slots[0] == 0)
3779 break;
3780
3781 path->slots[0]--;
3782 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3783 path->slots[0]);
3784
3785 if (found_key.objectid != objectid)
3786 break;
3787
Josef Bacik18ec90d2012-09-28 11:56:28 -04003788 found_key.offset = 0;
3789 found_key.type = 0;
3790 ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
3791 &start_slot);
Filipe Mananacbca7d52019-02-18 16:57:26 +00003792 if (ret < 0)
3793 break;
Josef Bacik18ec90d2012-09-28 11:56:28 -04003794
3795 ret = btrfs_del_items(trans, log, path, start_slot,
3796 path->slots[0] - start_slot + 1);
3797 /*
3798 * If start slot isn't 0 then we don't need to re-search, we've
3799 * found the last guy with the objectid in this tree.
3800 */
3801 if (ret || start_slot != 0)
Tsutomu Itoh65a246c2011-05-19 04:37:44 +00003802 break;
David Sterbab3b4aa72011-04-21 01:20:15 +02003803 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04003804 }
David Sterbab3b4aa72011-04-21 01:20:15 +02003805 btrfs_release_path(path);
Josef Bacik5bdbeb22012-05-29 16:59:49 -04003806 if (ret > 0)
3807 ret = 0;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003808 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04003809}
3810
Josef Bacik94edf4a2012-09-25 14:56:25 -04003811static void fill_inode_item(struct btrfs_trans_handle *trans,
3812 struct extent_buffer *leaf,
3813 struct btrfs_inode_item *item,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00003814 struct inode *inode, int log_inode_only,
3815 u64 logged_isize)
Josef Bacik94edf4a2012-09-25 14:56:25 -04003816{
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003817 struct btrfs_map_token token;
Josef Bacik94edf4a2012-09-25 14:56:25 -04003818
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003819 btrfs_init_map_token(&token);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003820
3821 if (log_inode_only) {
3822 /* set the generation to zero so the recover code
3823 * can tell the difference between an logging
3824 * just to say 'this inode exists' and a logging
3825 * to say 'update this inode with these values'
3826 */
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003827 btrfs_set_token_inode_generation(leaf, item, 0, &token);
Filipe Manana1a4bcf42015-02-13 12:30:56 +00003828 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003829 } else {
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003830 btrfs_set_token_inode_generation(leaf, item,
3831 BTRFS_I(inode)->generation,
3832 &token);
3833 btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003834 }
3835
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003836 btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3837 btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3838 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3839 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3840
David Sterbaa937b972014-12-12 17:39:12 +01003841 btrfs_set_token_timespec_sec(leaf, &item->atime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003842 inode->i_atime.tv_sec, &token);
David Sterbaa937b972014-12-12 17:39:12 +01003843 btrfs_set_token_timespec_nsec(leaf, &item->atime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003844 inode->i_atime.tv_nsec, &token);
3845
David Sterbaa937b972014-12-12 17:39:12 +01003846 btrfs_set_token_timespec_sec(leaf, &item->mtime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003847 inode->i_mtime.tv_sec, &token);
David Sterbaa937b972014-12-12 17:39:12 +01003848 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003849 inode->i_mtime.tv_nsec, &token);
3850
David Sterbaa937b972014-12-12 17:39:12 +01003851 btrfs_set_token_timespec_sec(leaf, &item->ctime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003852 inode->i_ctime.tv_sec, &token);
David Sterbaa937b972014-12-12 17:39:12 +01003853 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003854 inode->i_ctime.tv_nsec, &token);
3855
3856 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3857 &token);
3858
Jeff Laytonc7f88c42017-12-11 06:35:12 -05003859 btrfs_set_token_inode_sequence(leaf, item,
3860 inode_peek_iversion(inode), &token);
Josef Bacik0b1c6cc2012-10-23 16:03:44 -04003861 btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3862 btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3863 btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3864 btrfs_set_token_inode_block_group(leaf, item, 0, &token);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003865}
3866
Josef Bacika95249b2012-10-11 16:17:34 -04003867static int log_inode_item(struct btrfs_trans_handle *trans,
3868 struct btrfs_root *log, struct btrfs_path *path,
Nikolay Borisov6d889a32017-01-18 00:31:47 +02003869 struct btrfs_inode *inode)
Josef Bacika95249b2012-10-11 16:17:34 -04003870{
3871 struct btrfs_inode_item *inode_item;
Josef Bacika95249b2012-10-11 16:17:34 -04003872 int ret;
3873
Filipe David Borba Mananaefd0c402013-10-07 21:20:44 +01003874 ret = btrfs_insert_empty_item(trans, log, path,
Nikolay Borisov6d889a32017-01-18 00:31:47 +02003875 &inode->location, sizeof(*inode_item));
Josef Bacika95249b2012-10-11 16:17:34 -04003876 if (ret && ret != -EEXIST)
3877 return ret;
3878 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3879 struct btrfs_inode_item);
Nikolay Borisov6d889a32017-01-18 00:31:47 +02003880 fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode,
3881 0, 0);
Josef Bacika95249b2012-10-11 16:17:34 -04003882 btrfs_release_path(path);
3883 return 0;
3884}
3885
Chris Mason31ff1cd2008-09-11 16:17:57 -04003886static noinline int copy_items(struct btrfs_trans_handle *trans,
Nikolay Borisov44d70e12017-01-18 00:31:36 +02003887 struct btrfs_inode *inode,
Chris Mason31ff1cd2008-09-11 16:17:57 -04003888 struct btrfs_path *dst_path,
Josef Bacik16e75492013-10-22 12:18:51 -04003889 struct btrfs_path *src_path, u64 *last_extent,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00003890 int start_slot, int nr, int inode_only,
3891 u64 logged_isize)
Chris Mason31ff1cd2008-09-11 16:17:57 -04003892{
David Sterba3ffbd682018-06-29 10:56:42 +02003893 struct btrfs_fs_info *fs_info = trans->fs_info;
Chris Mason31ff1cd2008-09-11 16:17:57 -04003894 unsigned long src_offset;
3895 unsigned long dst_offset;
Nikolay Borisov44d70e12017-01-18 00:31:36 +02003896 struct btrfs_root *log = inode->root->log_root;
Chris Mason31ff1cd2008-09-11 16:17:57 -04003897 struct btrfs_file_extent_item *extent;
3898 struct btrfs_inode_item *inode_item;
Josef Bacik16e75492013-10-22 12:18:51 -04003899 struct extent_buffer *src = src_path->nodes[0];
3900 struct btrfs_key first_key, last_key, key;
Chris Mason31ff1cd2008-09-11 16:17:57 -04003901 int ret;
3902 struct btrfs_key *ins_keys;
3903 u32 *ins_sizes;
3904 char *ins_data;
3905 int i;
Chris Masond20f7042008-12-08 16:58:54 -05003906 struct list_head ordered_sums;
Nikolay Borisov44d70e12017-01-18 00:31:36 +02003907 int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
Josef Bacik16e75492013-10-22 12:18:51 -04003908 bool has_extents = false;
Filipe Manana74121f7c2014-08-07 12:00:44 +01003909 bool need_find_last_extent = true;
Josef Bacik16e75492013-10-22 12:18:51 -04003910 bool done = false;
Chris Masond20f7042008-12-08 16:58:54 -05003911
3912 INIT_LIST_HEAD(&ordered_sums);
Chris Mason31ff1cd2008-09-11 16:17:57 -04003913
3914 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
3915 nr * sizeof(u32), GFP_NOFS);
liubo2a29edc2011-01-26 06:22:08 +00003916 if (!ins_data)
3917 return -ENOMEM;
3918
Josef Bacik16e75492013-10-22 12:18:51 -04003919 first_key.objectid = (u64)-1;
3920
Chris Mason31ff1cd2008-09-11 16:17:57 -04003921 ins_sizes = (u32 *)ins_data;
3922 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
3923
3924 for (i = 0; i < nr; i++) {
3925 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
3926 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
3927 }
3928 ret = btrfs_insert_empty_items(trans, log, dst_path,
3929 ins_keys, ins_sizes, nr);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04003930 if (ret) {
3931 kfree(ins_data);
3932 return ret;
3933 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04003934
Yan Zheng5d4f98a2009-06-10 10:45:14 -04003935 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
Chris Mason31ff1cd2008-09-11 16:17:57 -04003936 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
3937 dst_path->slots[0]);
3938
3939 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
3940
Matthias Kaehlcke0dde10b2017-07-27 14:30:23 -07003941 if (i == nr - 1)
Josef Bacik16e75492013-10-22 12:18:51 -04003942 last_key = ins_keys[i];
3943
Josef Bacik94edf4a2012-09-25 14:56:25 -04003944 if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
Chris Mason31ff1cd2008-09-11 16:17:57 -04003945 inode_item = btrfs_item_ptr(dst_path->nodes[0],
3946 dst_path->slots[0],
3947 struct btrfs_inode_item);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003948 fill_inode_item(trans, dst_path->nodes[0], inode_item,
David Sterbaf85b7372017-01-20 14:54:07 +01003949 &inode->vfs_inode,
3950 inode_only == LOG_INODE_EXISTS,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00003951 logged_isize);
Josef Bacik94edf4a2012-09-25 14:56:25 -04003952 } else {
3953 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3954 src_offset, ins_sizes[i]);
Chris Mason31ff1cd2008-09-11 16:17:57 -04003955 }
Josef Bacik94edf4a2012-09-25 14:56:25 -04003956
Josef Bacik16e75492013-10-22 12:18:51 -04003957 /*
3958 * We set need_find_last_extent here in case we know we were
3959 * processing other items and then walk into the first extent in
3960 * the inode. If we don't hit an extent then nothing changes,
3961 * we'll do the last search the next time around.
3962 */
3963 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
3964 has_extents = true;
Filipe Manana74121f7c2014-08-07 12:00:44 +01003965 if (first_key.objectid == (u64)-1)
Josef Bacik16e75492013-10-22 12:18:51 -04003966 first_key = ins_keys[i];
3967 } else {
3968 need_find_last_extent = false;
3969 }
3970
Chris Mason31ff1cd2008-09-11 16:17:57 -04003971 /* take a reference on file data extents so that truncates
3972 * or deletes of this inode don't have to relog the inode
3973 * again
3974 */
David Sterba962a2982014-06-04 18:41:45 +02003975 if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY &&
Liu Bod2794402012-08-29 01:07:56 -06003976 !skip_csum) {
Chris Mason31ff1cd2008-09-11 16:17:57 -04003977 int found_type;
3978 extent = btrfs_item_ptr(src, start_slot + i,
3979 struct btrfs_file_extent_item);
3980
liubo8e531cd2011-05-06 10:36:09 +08003981 if (btrfs_file_extent_generation(src, extent) < trans->transid)
3982 continue;
3983
Chris Mason31ff1cd2008-09-11 16:17:57 -04003984 found_type = btrfs_file_extent_type(src, extent);
Josef Bacik6f1fed72012-09-26 11:07:06 -04003985 if (found_type == BTRFS_FILE_EXTENT_REG) {
Yan Zheng5d4f98a2009-06-10 10:45:14 -04003986 u64 ds, dl, cs, cl;
3987 ds = btrfs_file_extent_disk_bytenr(src,
3988 extent);
3989 /* ds == 0 is a hole */
3990 if (ds == 0)
3991 continue;
3992
3993 dl = btrfs_file_extent_disk_num_bytes(src,
3994 extent);
3995 cs = btrfs_file_extent_offset(src, extent);
3996 cl = btrfs_file_extent_num_bytes(src,
Joe Perchesa419aef2009-08-18 11:18:35 -07003997 extent);
Chris Mason580afd72008-12-08 19:15:39 -05003998 if (btrfs_file_extent_compression(src,
3999 extent)) {
4000 cs = 0;
4001 cl = dl;
4002 }
Yan Zheng5d4f98a2009-06-10 10:45:14 -04004003
4004 ret = btrfs_lookup_csums_range(
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004005 fs_info->csum_root,
Yan Zheng5d4f98a2009-06-10 10:45:14 -04004006 ds + cs, ds + cs + cl - 1,
Arne Jansena2de7332011-03-08 14:14:00 +01004007 &ordered_sums, 0);
Josef Bacik36508602013-04-25 16:23:32 -04004008 if (ret) {
4009 btrfs_release_path(dst_path);
4010 kfree(ins_data);
4011 return ret;
4012 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04004013 }
4014 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04004015 }
4016
4017 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
David Sterbab3b4aa72011-04-21 01:20:15 +02004018 btrfs_release_path(dst_path);
Chris Mason31ff1cd2008-09-11 16:17:57 -04004019 kfree(ins_data);
Chris Masond20f7042008-12-08 16:58:54 -05004020
4021 /*
4022 * we have to do this after the loop above to avoid changing the
4023 * log tree while trying to change the log tree.
4024 */
Yan, Zheng4a500fd2010-05-16 10:49:59 -04004025 ret = 0;
Chris Masond3977122009-01-05 21:25:51 -05004026 while (!list_empty(&ordered_sums)) {
Chris Masond20f7042008-12-08 16:58:54 -05004027 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4028 struct btrfs_ordered_sum,
4029 list);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04004030 if (!ret)
4031 ret = btrfs_csum_file_blocks(trans, log, sums);
Chris Masond20f7042008-12-08 16:58:54 -05004032 list_del(&sums->list);
4033 kfree(sums);
4034 }
Josef Bacik16e75492013-10-22 12:18:51 -04004035
4036 if (!has_extents)
4037 return ret;
4038
Filipe Manana74121f7c2014-08-07 12:00:44 +01004039 if (need_find_last_extent && *last_extent == first_key.offset) {
4040 /*
4041 * We don't have any leafs between our current one and the one
4042 * we processed before that can have file extent items for our
4043 * inode (and have a generation number smaller than our current
4044 * transaction id).
4045 */
4046 need_find_last_extent = false;
4047 }
4048
Josef Bacik16e75492013-10-22 12:18:51 -04004049 /*
4050 * Because we use btrfs_search_forward we could skip leaves that were
4051 * not modified and then assume *last_extent is valid when it really
4052 * isn't. So back up to the previous leaf and read the end of the last
4053 * extent before we go and fill in holes.
4054 */
4055 if (need_find_last_extent) {
4056 u64 len;
4057
Nikolay Borisov44d70e12017-01-18 00:31:36 +02004058 ret = btrfs_prev_leaf(inode->root, src_path);
Josef Bacik16e75492013-10-22 12:18:51 -04004059 if (ret < 0)
4060 return ret;
4061 if (ret)
4062 goto fill_holes;
4063 if (src_path->slots[0])
4064 src_path->slots[0]--;
4065 src = src_path->nodes[0];
4066 btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
Nikolay Borisov44d70e12017-01-18 00:31:36 +02004067 if (key.objectid != btrfs_ino(inode) ||
Josef Bacik16e75492013-10-22 12:18:51 -04004068 key.type != BTRFS_EXTENT_DATA_KEY)
4069 goto fill_holes;
4070 extent = btrfs_item_ptr(src, src_path->slots[0],
4071 struct btrfs_file_extent_item);
4072 if (btrfs_file_extent_type(src, extent) ==
4073 BTRFS_FILE_EXTENT_INLINE) {
Qu Wenruoe41ca582018-06-06 15:41:49 +08004074 len = btrfs_file_extent_ram_bytes(src, extent);
Josef Bacik16e75492013-10-22 12:18:51 -04004075 *last_extent = ALIGN(key.offset + len,
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004076 fs_info->sectorsize);
Josef Bacik16e75492013-10-22 12:18:51 -04004077 } else {
4078 len = btrfs_file_extent_num_bytes(src, extent);
4079 *last_extent = key.offset + len;
4080 }
4081 }
4082fill_holes:
4083 /* So we did prev_leaf, now we need to move to the next leaf, but a few
4084 * things could have happened
4085 *
4086 * 1) A merge could have happened, so we could currently be on a leaf
4087 * that holds what we were copying in the first place.
4088 * 2) A split could have happened, and now not all of the items we want
4089 * are on the same leaf.
4090 *
4091 * So we need to adjust how we search for holes, we need to drop the
4092 * path and re-search for the first extent key we found, and then walk
4093 * forward until we hit the last one we copied.
4094 */
4095 if (need_find_last_extent) {
4096 /* btrfs_prev_leaf could return 1 without releasing the path */
4097 btrfs_release_path(src_path);
David Sterbaf85b7372017-01-20 14:54:07 +01004098 ret = btrfs_search_slot(NULL, inode->root, &first_key,
4099 src_path, 0, 0);
Josef Bacik16e75492013-10-22 12:18:51 -04004100 if (ret < 0)
4101 return ret;
4102 ASSERT(ret == 0);
4103 src = src_path->nodes[0];
4104 i = src_path->slots[0];
4105 } else {
4106 i = start_slot;
4107 }
4108
4109 /*
4110 * Ok so here we need to go through and fill in any holes we may have
4111 * to make sure that holes are punched for those areas in case they had
4112 * extents previously.
4113 */
4114 while (!done) {
4115 u64 offset, len;
4116 u64 extent_end;
4117
4118 if (i >= btrfs_header_nritems(src_path->nodes[0])) {
Nikolay Borisov44d70e12017-01-18 00:31:36 +02004119 ret = btrfs_next_leaf(inode->root, src_path);
Josef Bacik16e75492013-10-22 12:18:51 -04004120 if (ret < 0)
4121 return ret;
4122 ASSERT(ret == 0);
4123 src = src_path->nodes[0];
4124 i = 0;
Filipe Manana8434ec42018-03-26 23:59:12 +01004125 need_find_last_extent = true;
Josef Bacik16e75492013-10-22 12:18:51 -04004126 }
4127
4128 btrfs_item_key_to_cpu(src, &key, i);
4129 if (!btrfs_comp_cpu_keys(&key, &last_key))
4130 done = true;
Nikolay Borisov44d70e12017-01-18 00:31:36 +02004131 if (key.objectid != btrfs_ino(inode) ||
Josef Bacik16e75492013-10-22 12:18:51 -04004132 key.type != BTRFS_EXTENT_DATA_KEY) {
4133 i++;
4134 continue;
4135 }
4136 extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
4137 if (btrfs_file_extent_type(src, extent) ==
4138 BTRFS_FILE_EXTENT_INLINE) {
Qu Wenruoe41ca582018-06-06 15:41:49 +08004139 len = btrfs_file_extent_ram_bytes(src, extent);
Jeff Mahoneyda170662016-06-15 09:22:56 -04004140 extent_end = ALIGN(key.offset + len,
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004141 fs_info->sectorsize);
Josef Bacik16e75492013-10-22 12:18:51 -04004142 } else {
4143 len = btrfs_file_extent_num_bytes(src, extent);
4144 extent_end = key.offset + len;
4145 }
4146 i++;
4147
4148 if (*last_extent == key.offset) {
4149 *last_extent = extent_end;
4150 continue;
4151 }
4152 offset = *last_extent;
4153 len = key.offset - *last_extent;
Nikolay Borisov44d70e12017-01-18 00:31:36 +02004154 ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
David Sterbaf85b7372017-01-20 14:54:07 +01004155 offset, 0, 0, len, 0, len, 0, 0, 0);
Josef Bacik16e75492013-10-22 12:18:51 -04004156 if (ret)
4157 break;
Filipe Manana74121f7c2014-08-07 12:00:44 +01004158 *last_extent = extent_end;
Josef Bacik16e75492013-10-22 12:18:51 -04004159 }
Filipe Manana4ee3fad2018-03-26 23:59:00 +01004160
4161 /*
4162 * Check if there is a hole between the last extent found in our leaf
4163 * and the first extent in the next leaf. If there is one, we need to
4164 * log an explicit hole so that at replay time we can punch the hole.
4165 */
4166 if (ret == 0 &&
4167 key.objectid == btrfs_ino(inode) &&
4168 key.type == BTRFS_EXTENT_DATA_KEY &&
4169 i == btrfs_header_nritems(src_path->nodes[0])) {
4170 ret = btrfs_next_leaf(inode->root, src_path);
4171 need_find_last_extent = true;
4172 if (ret > 0) {
4173 ret = 0;
4174 } else if (ret == 0) {
4175 btrfs_item_key_to_cpu(src_path->nodes[0], &key,
4176 src_path->slots[0]);
4177 if (key.objectid == btrfs_ino(inode) &&
4178 key.type == BTRFS_EXTENT_DATA_KEY &&
4179 *last_extent < key.offset) {
4180 const u64 len = key.offset - *last_extent;
4181
4182 ret = btrfs_insert_file_extent(trans, log,
4183 btrfs_ino(inode),
4184 *last_extent, 0,
4185 0, len, 0, len,
4186 0, 0, 0);
4187 }
4188 }
4189 }
Josef Bacik16e75492013-10-22 12:18:51 -04004190 /*
4191 * Need to let the callers know we dropped the path so they should
4192 * re-search.
4193 */
4194 if (!ret && need_find_last_extent)
4195 ret = 1;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04004196 return ret;
Chris Mason31ff1cd2008-09-11 16:17:57 -04004197}
4198
Josef Bacik5dc562c2012-08-17 13:14:17 -04004199static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
4200{
4201 struct extent_map *em1, *em2;
4202
4203 em1 = list_entry(a, struct extent_map, list);
4204 em2 = list_entry(b, struct extent_map, list);
4205
4206 if (em1->start < em2->start)
4207 return -1;
4208 else if (em1->start > em2->start)
4209 return 1;
4210 return 0;
4211}
4212
Josef Bacike7175a62018-05-23 11:58:34 -04004213static int log_extent_csums(struct btrfs_trans_handle *trans,
4214 struct btrfs_inode *inode,
Nikolay Borisova9ecb652018-06-20 17:26:42 +03004215 struct btrfs_root *log_root,
Josef Bacike7175a62018-05-23 11:58:34 -04004216 const struct extent_map *em)
Josef Bacik5dc562c2012-08-17 13:14:17 -04004217{
Josef Bacik2ab28f32012-10-12 15:27:49 -04004218 u64 csum_offset;
4219 u64 csum_len;
Filipe Manana8407f552014-09-05 15:14:39 +01004220 LIST_HEAD(ordered_sums);
4221 int ret = 0;
Josef Bacik09a2a8f92013-04-05 16:51:15 -04004222
Josef Bacike7175a62018-05-23 11:58:34 -04004223 if (inode->flags & BTRFS_INODE_NODATASUM ||
4224 test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
Filipe Manana8407f552014-09-05 15:14:39 +01004225 em->block_start == EXTENT_MAP_HOLE)
Josef Bacik70c8a912012-10-11 16:54:30 -04004226 return 0;
4227
Josef Bacike7175a62018-05-23 11:58:34 -04004228 /* If we're compressed we have to save the entire range of csums. */
Filipe David Borba Manana488111a2013-10-28 16:30:29 +00004229 if (em->compress_type) {
4230 csum_offset = 0;
Filipe Manana8407f552014-09-05 15:14:39 +01004231 csum_len = max(em->block_len, em->orig_block_len);
Filipe David Borba Manana488111a2013-10-28 16:30:29 +00004232 } else {
Josef Bacike7175a62018-05-23 11:58:34 -04004233 csum_offset = em->mod_start - em->start;
4234 csum_len = em->mod_len;
Filipe David Borba Manana488111a2013-10-28 16:30:29 +00004235 }
Josef Bacik2ab28f32012-10-12 15:27:49 -04004236
Josef Bacik70c8a912012-10-11 16:54:30 -04004237 /* block start is already adjusted for the file extent offset. */
Nikolay Borisova9ecb652018-06-20 17:26:42 +03004238 ret = btrfs_lookup_csums_range(trans->fs_info->csum_root,
Josef Bacik70c8a912012-10-11 16:54:30 -04004239 em->block_start + csum_offset,
4240 em->block_start + csum_offset +
4241 csum_len - 1, &ordered_sums, 0);
4242 if (ret)
4243 return ret;
4244
4245 while (!list_empty(&ordered_sums)) {
4246 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
4247 struct btrfs_ordered_sum,
4248 list);
4249 if (!ret)
Nikolay Borisova9ecb652018-06-20 17:26:42 +03004250 ret = btrfs_csum_file_blocks(trans, log_root, sums);
Josef Bacik70c8a912012-10-11 16:54:30 -04004251 list_del(&sums->list);
4252 kfree(sums);
4253 }
4254
4255 return ret;
Josef Bacik5dc562c2012-08-17 13:14:17 -04004256}
4257
Filipe Manana8407f552014-09-05 15:14:39 +01004258static int log_one_extent(struct btrfs_trans_handle *trans,
Nikolay Borisov9d122622017-01-18 00:31:40 +02004259 struct btrfs_inode *inode, struct btrfs_root *root,
Filipe Manana8407f552014-09-05 15:14:39 +01004260 const struct extent_map *em,
4261 struct btrfs_path *path,
Filipe Manana8407f552014-09-05 15:14:39 +01004262 struct btrfs_log_ctx *ctx)
4263{
4264 struct btrfs_root *log = root->log_root;
4265 struct btrfs_file_extent_item *fi;
4266 struct extent_buffer *leaf;
4267 struct btrfs_map_token token;
4268 struct btrfs_key key;
4269 u64 extent_offset = em->start - em->orig_start;
4270 u64 block_len;
4271 int ret;
4272 int extent_inserted = 0;
Filipe Manana8407f552014-09-05 15:14:39 +01004273
Nikolay Borisova9ecb652018-06-20 17:26:42 +03004274 ret = log_extent_csums(trans, inode, log, em);
Filipe Manana8407f552014-09-05 15:14:39 +01004275 if (ret)
4276 return ret;
4277
Filipe Manana8407f552014-09-05 15:14:39 +01004278 btrfs_init_map_token(&token);
4279
Nikolay Borisov9d122622017-01-18 00:31:40 +02004280 ret = __btrfs_drop_extents(trans, log, &inode->vfs_inode, path, em->start,
Filipe Manana8407f552014-09-05 15:14:39 +01004281 em->start + em->len, NULL, 0, 1,
4282 sizeof(*fi), &extent_inserted);
4283 if (ret)
4284 return ret;
4285
4286 if (!extent_inserted) {
Nikolay Borisov9d122622017-01-18 00:31:40 +02004287 key.objectid = btrfs_ino(inode);
Filipe Manana8407f552014-09-05 15:14:39 +01004288 key.type = BTRFS_EXTENT_DATA_KEY;
4289 key.offset = em->start;
4290
4291 ret = btrfs_insert_empty_item(trans, log, path, &key,
4292 sizeof(*fi));
4293 if (ret)
4294 return ret;
4295 }
4296 leaf = path->nodes[0];
4297 fi = btrfs_item_ptr(leaf, path->slots[0],
4298 struct btrfs_file_extent_item);
4299
Josef Bacik50d9aa92014-11-21 14:52:38 -05004300 btrfs_set_token_file_extent_generation(leaf, fi, trans->transid,
Filipe Manana8407f552014-09-05 15:14:39 +01004301 &token);
4302 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
4303 btrfs_set_token_file_extent_type(leaf, fi,
4304 BTRFS_FILE_EXTENT_PREALLOC,
4305 &token);
4306 else
4307 btrfs_set_token_file_extent_type(leaf, fi,
4308 BTRFS_FILE_EXTENT_REG,
4309 &token);
4310
4311 block_len = max(em->block_len, em->orig_block_len);
4312 if (em->compress_type != BTRFS_COMPRESS_NONE) {
4313 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4314 em->block_start,
4315 &token);
4316 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4317 &token);
4318 } else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
4319 btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
4320 em->block_start -
4321 extent_offset, &token);
4322 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
4323 &token);
4324 } else {
4325 btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
4326 btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
4327 &token);
4328 }
4329
4330 btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token);
4331 btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
4332 btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
4333 btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
4334 &token);
4335 btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
4336 btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
4337 btrfs_mark_buffer_dirty(leaf);
4338
4339 btrfs_release_path(path);
4340
4341 return ret;
4342}
4343
Filipe Manana31d11b82018-05-09 16:01:46 +01004344/*
4345 * Log all prealloc extents beyond the inode's i_size to make sure we do not
4346 * lose them after doing a fast fsync and replaying the log. We scan the
4347 * subvolume's root instead of iterating the inode's extent map tree because
4348 * otherwise we can log incorrect extent items based on extent map conversion.
4349 * That can happen due to the fact that extent maps are merged when they
4350 * are not in the extent map tree's list of modified extents.
4351 */
4352static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
4353 struct btrfs_inode *inode,
4354 struct btrfs_path *path)
4355{
4356 struct btrfs_root *root = inode->root;
4357 struct btrfs_key key;
4358 const u64 i_size = i_size_read(&inode->vfs_inode);
4359 const u64 ino = btrfs_ino(inode);
4360 struct btrfs_path *dst_path = NULL;
4361 u64 last_extent = (u64)-1;
4362 int ins_nr = 0;
4363 int start_slot;
4364 int ret;
4365
4366 if (!(inode->flags & BTRFS_INODE_PREALLOC))
4367 return 0;
4368
4369 key.objectid = ino;
4370 key.type = BTRFS_EXTENT_DATA_KEY;
4371 key.offset = i_size;
4372 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4373 if (ret < 0)
4374 goto out;
4375
4376 while (true) {
4377 struct extent_buffer *leaf = path->nodes[0];
4378 int slot = path->slots[0];
4379
4380 if (slot >= btrfs_header_nritems(leaf)) {
4381 if (ins_nr > 0) {
4382 ret = copy_items(trans, inode, dst_path, path,
4383 &last_extent, start_slot,
4384 ins_nr, 1, 0);
4385 if (ret < 0)
4386 goto out;
4387 ins_nr = 0;
4388 }
4389 ret = btrfs_next_leaf(root, path);
4390 if (ret < 0)
4391 goto out;
4392 if (ret > 0) {
4393 ret = 0;
4394 break;
4395 }
4396 continue;
4397 }
4398
4399 btrfs_item_key_to_cpu(leaf, &key, slot);
4400 if (key.objectid > ino)
4401 break;
4402 if (WARN_ON_ONCE(key.objectid < ino) ||
4403 key.type < BTRFS_EXTENT_DATA_KEY ||
4404 key.offset < i_size) {
4405 path->slots[0]++;
4406 continue;
4407 }
4408 if (last_extent == (u64)-1) {
4409 last_extent = key.offset;
4410 /*
4411 * Avoid logging extent items logged in past fsync calls
4412 * and leading to duplicate keys in the log tree.
4413 */
4414 do {
4415 ret = btrfs_truncate_inode_items(trans,
4416 root->log_root,
4417 &inode->vfs_inode,
4418 i_size,
4419 BTRFS_EXTENT_DATA_KEY);
4420 } while (ret == -EAGAIN);
4421 if (ret)
4422 goto out;
4423 }
4424 if (ins_nr == 0)
4425 start_slot = slot;
4426 ins_nr++;
4427 path->slots[0]++;
4428 if (!dst_path) {
4429 dst_path = btrfs_alloc_path();
4430 if (!dst_path) {
4431 ret = -ENOMEM;
4432 goto out;
4433 }
4434 }
4435 }
4436 if (ins_nr > 0) {
4437 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4438 start_slot, ins_nr, 1, 0);
4439 if (ret > 0)
4440 ret = 0;
4441 }
4442out:
4443 btrfs_release_path(path);
4444 btrfs_free_path(dst_path);
4445 return ret;
4446}
4447
Josef Bacik5dc562c2012-08-17 13:14:17 -04004448static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
4449 struct btrfs_root *root,
Nikolay Borisov9d122622017-01-18 00:31:40 +02004450 struct btrfs_inode *inode,
Miao Xie827463c2014-01-14 20:31:51 +08004451 struct btrfs_path *path,
Filipe Mananade0ee0e2016-01-21 10:17:54 +00004452 struct btrfs_log_ctx *ctx,
4453 const u64 start,
4454 const u64 end)
Josef Bacik5dc562c2012-08-17 13:14:17 -04004455{
Josef Bacik5dc562c2012-08-17 13:14:17 -04004456 struct extent_map *em, *n;
4457 struct list_head extents;
Nikolay Borisov9d122622017-01-18 00:31:40 +02004458 struct extent_map_tree *tree = &inode->extent_tree;
Josef Bacik5dc562c2012-08-17 13:14:17 -04004459 u64 test_gen;
4460 int ret = 0;
Josef Bacik2ab28f32012-10-12 15:27:49 -04004461 int num = 0;
Josef Bacik5dc562c2012-08-17 13:14:17 -04004462
4463 INIT_LIST_HEAD(&extents);
4464
Josef Bacik5dc562c2012-08-17 13:14:17 -04004465 write_lock(&tree->lock);
4466 test_gen = root->fs_info->last_trans_committed;
4467
4468 list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
Filipe Manana008c6752018-10-29 09:42:06 +00004469 /*
4470 * Skip extents outside our logging range. It's important to do
4471 * it for correctness because if we don't ignore them, we may
4472 * log them before their ordered extent completes, and therefore
4473 * we could log them without logging their respective checksums
4474 * (the checksum items are added to the csum tree at the very
4475 * end of btrfs_finish_ordered_io()). Also leave such extents
4476 * outside of our range in the list, since we may have another
4477 * ranged fsync in the near future that needs them. If an extent
4478 * outside our range corresponds to a hole, log it to avoid
4479 * leaving gaps between extents (fsck will complain when we are
4480 * not using the NO_HOLES feature).
4481 */
4482 if ((em->start > end || em->start + em->len <= start) &&
4483 em->block_start != EXTENT_MAP_HOLE)
4484 continue;
4485
Josef Bacik5dc562c2012-08-17 13:14:17 -04004486 list_del_init(&em->list);
Josef Bacik2ab28f32012-10-12 15:27:49 -04004487 /*
4488 * Just an arbitrary number, this can be really CPU intensive
4489 * once we start getting a lot of extents, and really once we
4490 * have a bunch of extents we just want to commit since it will
4491 * be faster.
4492 */
4493 if (++num > 32768) {
4494 list_del_init(&tree->modified_extents);
4495 ret = -EFBIG;
4496 goto process;
4497 }
4498
Josef Bacik5dc562c2012-08-17 13:14:17 -04004499 if (em->generation <= test_gen)
4500 continue;
Josef Bacik8c6c5922017-08-29 10:11:39 -04004501
Filipe Manana31d11b82018-05-09 16:01:46 +01004502 /* We log prealloc extents beyond eof later. */
4503 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
4504 em->start >= i_size_read(&inode->vfs_inode))
4505 continue;
4506
Josef Bacikff44c6e2012-09-14 12:59:20 -04004507 /* Need a ref to keep it from getting evicted from cache */
Elena Reshetova490b54d2017-03-03 10:55:12 +02004508 refcount_inc(&em->refs);
Josef Bacikff44c6e2012-09-14 12:59:20 -04004509 set_bit(EXTENT_FLAG_LOGGING, &em->flags);
Josef Bacik5dc562c2012-08-17 13:14:17 -04004510 list_add_tail(&em->list, &extents);
Josef Bacik2ab28f32012-10-12 15:27:49 -04004511 num++;
Josef Bacik5dc562c2012-08-17 13:14:17 -04004512 }
4513
4514 list_sort(NULL, &extents, extent_cmp);
Josef Bacik2ab28f32012-10-12 15:27:49 -04004515process:
Josef Bacik5dc562c2012-08-17 13:14:17 -04004516 while (!list_empty(&extents)) {
4517 em = list_entry(extents.next, struct extent_map, list);
4518
4519 list_del_init(&em->list);
4520
4521 /*
4522 * If we had an error we just need to delete everybody from our
4523 * private list.
4524 */
Josef Bacikff44c6e2012-09-14 12:59:20 -04004525 if (ret) {
Josef Bacik201a9032013-01-24 12:02:07 -05004526 clear_em_logging(tree, em);
Josef Bacikff44c6e2012-09-14 12:59:20 -04004527 free_extent_map(em);
Josef Bacik5dc562c2012-08-17 13:14:17 -04004528 continue;
Josef Bacikff44c6e2012-09-14 12:59:20 -04004529 }
4530
4531 write_unlock(&tree->lock);
Josef Bacik5dc562c2012-08-17 13:14:17 -04004532
Josef Bacika2120a42018-05-23 11:58:35 -04004533 ret = log_one_extent(trans, inode, root, em, path, ctx);
Josef Bacikff44c6e2012-09-14 12:59:20 -04004534 write_lock(&tree->lock);
Josef Bacik201a9032013-01-24 12:02:07 -05004535 clear_em_logging(tree, em);
4536 free_extent_map(em);
Josef Bacik5dc562c2012-08-17 13:14:17 -04004537 }
Josef Bacikff44c6e2012-09-14 12:59:20 -04004538 WARN_ON(!list_empty(&extents));
4539 write_unlock(&tree->lock);
Josef Bacik5dc562c2012-08-17 13:14:17 -04004540
Josef Bacik5dc562c2012-08-17 13:14:17 -04004541 btrfs_release_path(path);
Filipe Manana31d11b82018-05-09 16:01:46 +01004542 if (!ret)
4543 ret = btrfs_log_prealloc_extents(trans, inode, path);
4544
Josef Bacik5dc562c2012-08-17 13:14:17 -04004545 return ret;
4546}
4547
Nikolay Borisov481b01c2017-01-18 00:31:34 +02004548static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00004549 struct btrfs_path *path, u64 *size_ret)
4550{
4551 struct btrfs_key key;
4552 int ret;
4553
Nikolay Borisov481b01c2017-01-18 00:31:34 +02004554 key.objectid = btrfs_ino(inode);
Filipe Manana1a4bcf42015-02-13 12:30:56 +00004555 key.type = BTRFS_INODE_ITEM_KEY;
4556 key.offset = 0;
4557
4558 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
4559 if (ret < 0) {
4560 return ret;
4561 } else if (ret > 0) {
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00004562 *size_ret = 0;
Filipe Manana1a4bcf42015-02-13 12:30:56 +00004563 } else {
4564 struct btrfs_inode_item *item;
4565
4566 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
4567 struct btrfs_inode_item);
4568 *size_ret = btrfs_inode_size(path->nodes[0], item);
Filipe Mananabf504112019-03-04 14:06:12 +00004569 /*
4570 * If the in-memory inode's i_size is smaller then the inode
4571 * size stored in the btree, return the inode's i_size, so
4572 * that we get a correct inode size after replaying the log
4573 * when before a power failure we had a shrinking truncate
4574 * followed by addition of a new name (rename / new hard link).
4575 * Otherwise return the inode size from the btree, to avoid
4576 * data loss when replaying a log due to previously doing a
4577 * write that expands the inode's size and logging a new name
4578 * immediately after.
4579 */
4580 if (*size_ret > inode->vfs_inode.i_size)
4581 *size_ret = inode->vfs_inode.i_size;
Filipe Manana1a4bcf42015-02-13 12:30:56 +00004582 }
4583
4584 btrfs_release_path(path);
4585 return 0;
4586}
4587
Filipe Manana36283bf2015-06-20 00:44:51 +01004588/*
4589 * At the moment we always log all xattrs. This is to figure out at log replay
4590 * time which xattrs must have their deletion replayed. If a xattr is missing
4591 * in the log tree and exists in the fs/subvol tree, we delete it. This is
4592 * because if a xattr is deleted, the inode is fsynced and a power failure
4593 * happens, causing the log to be replayed the next time the fs is mounted,
4594 * we want the xattr to not exist anymore (same behaviour as other filesystems
4595 * with a journal, ext3/4, xfs, f2fs, etc).
4596 */
4597static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
4598 struct btrfs_root *root,
Nikolay Borisov1a93c362017-01-18 00:31:37 +02004599 struct btrfs_inode *inode,
Filipe Manana36283bf2015-06-20 00:44:51 +01004600 struct btrfs_path *path,
4601 struct btrfs_path *dst_path)
4602{
4603 int ret;
4604 struct btrfs_key key;
Nikolay Borisov1a93c362017-01-18 00:31:37 +02004605 const u64 ino = btrfs_ino(inode);
Filipe Manana36283bf2015-06-20 00:44:51 +01004606 int ins_nr = 0;
4607 int start_slot = 0;
4608
4609 key.objectid = ino;
4610 key.type = BTRFS_XATTR_ITEM_KEY;
4611 key.offset = 0;
4612
4613 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4614 if (ret < 0)
4615 return ret;
4616
4617 while (true) {
4618 int slot = path->slots[0];
4619 struct extent_buffer *leaf = path->nodes[0];
4620 int nritems = btrfs_header_nritems(leaf);
4621
4622 if (slot >= nritems) {
4623 if (ins_nr > 0) {
4624 u64 last_extent = 0;
4625
Nikolay Borisov1a93c362017-01-18 00:31:37 +02004626 ret = copy_items(trans, inode, dst_path, path,
Filipe Manana36283bf2015-06-20 00:44:51 +01004627 &last_extent, start_slot,
4628 ins_nr, 1, 0);
4629 /* can't be 1, extent items aren't processed */
4630 ASSERT(ret <= 0);
4631 if (ret < 0)
4632 return ret;
4633 ins_nr = 0;
4634 }
4635 ret = btrfs_next_leaf(root, path);
4636 if (ret < 0)
4637 return ret;
4638 else if (ret > 0)
4639 break;
4640 continue;
4641 }
4642
4643 btrfs_item_key_to_cpu(leaf, &key, slot);
4644 if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY)
4645 break;
4646
4647 if (ins_nr == 0)
4648 start_slot = slot;
4649 ins_nr++;
4650 path->slots[0]++;
4651 cond_resched();
4652 }
4653 if (ins_nr > 0) {
4654 u64 last_extent = 0;
4655
Nikolay Borisov1a93c362017-01-18 00:31:37 +02004656 ret = copy_items(trans, inode, dst_path, path,
Filipe Manana36283bf2015-06-20 00:44:51 +01004657 &last_extent, start_slot,
4658 ins_nr, 1, 0);
4659 /* can't be 1, extent items aren't processed */
4660 ASSERT(ret <= 0);
4661 if (ret < 0)
4662 return ret;
4663 }
4664
4665 return 0;
4666}
4667
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004668/*
4669 * If the no holes feature is enabled we need to make sure any hole between the
4670 * last extent and the i_size of our inode is explicitly marked in the log. This
4671 * is to make sure that doing something like:
4672 *
4673 * 1) create file with 128Kb of data
4674 * 2) truncate file to 64Kb
4675 * 3) truncate file to 256Kb
4676 * 4) fsync file
4677 * 5) <crash/power failure>
4678 * 6) mount fs and trigger log replay
4679 *
4680 * Will give us a file with a size of 256Kb, the first 64Kb of data match what
4681 * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
4682 * file correspond to a hole. The presence of explicit holes in a log tree is
4683 * what guarantees that log replay will remove/adjust file extent items in the
4684 * fs/subvol tree.
4685 *
4686 * Here we do not need to care about holes between extents, that is already done
4687 * by copy_items(). We also only need to do this in the full sync path, where we
4688 * lookup for extents from the fs/subvol tree only. In the fast path case, we
4689 * lookup the list of modified extent maps and if any represents a hole, we
4690 * insert a corresponding extent representing a hole in the log tree.
4691 */
4692static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
4693 struct btrfs_root *root,
Nikolay Borisova0308dd2017-01-18 00:31:38 +02004694 struct btrfs_inode *inode,
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004695 struct btrfs_path *path)
4696{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004697 struct btrfs_fs_info *fs_info = root->fs_info;
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004698 int ret;
4699 struct btrfs_key key;
4700 u64 hole_start;
4701 u64 hole_size;
4702 struct extent_buffer *leaf;
4703 struct btrfs_root *log = root->log_root;
Nikolay Borisova0308dd2017-01-18 00:31:38 +02004704 const u64 ino = btrfs_ino(inode);
4705 const u64 i_size = i_size_read(&inode->vfs_inode);
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004706
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004707 if (!btrfs_fs_incompat(fs_info, NO_HOLES))
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004708 return 0;
4709
4710 key.objectid = ino;
4711 key.type = BTRFS_EXTENT_DATA_KEY;
4712 key.offset = (u64)-1;
4713
4714 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4715 ASSERT(ret != 0);
4716 if (ret < 0)
4717 return ret;
4718
4719 ASSERT(path->slots[0] > 0);
4720 path->slots[0]--;
4721 leaf = path->nodes[0];
4722 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4723
4724 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
4725 /* inode does not have any extents */
4726 hole_start = 0;
4727 hole_size = i_size;
4728 } else {
4729 struct btrfs_file_extent_item *extent;
4730 u64 len;
4731
4732 /*
4733 * If there's an extent beyond i_size, an explicit hole was
4734 * already inserted by copy_items().
4735 */
4736 if (key.offset >= i_size)
4737 return 0;
4738
4739 extent = btrfs_item_ptr(leaf, path->slots[0],
4740 struct btrfs_file_extent_item);
4741
4742 if (btrfs_file_extent_type(leaf, extent) ==
Filipe Manana0ccc3872019-03-19 17:18:13 +00004743 BTRFS_FILE_EXTENT_INLINE)
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004744 return 0;
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004745
4746 len = btrfs_file_extent_num_bytes(leaf, extent);
4747 /* Last extent goes beyond i_size, no need to log a hole. */
4748 if (key.offset + len > i_size)
4749 return 0;
4750 hole_start = key.offset + len;
4751 hole_size = i_size - hole_start;
4752 }
4753 btrfs_release_path(path);
4754
4755 /* Last extent ends at i_size. */
4756 if (hole_size == 0)
4757 return 0;
4758
Jeff Mahoney0b246af2016-06-22 18:54:23 -04004759 hole_size = ALIGN(hole_size, fs_info->sectorsize);
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01004760 ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
4761 hole_size, 0, hole_size, 0, 0, 0);
4762 return ret;
4763}
4764
Filipe Manana56f23fd2016-03-30 23:37:21 +01004765/*
4766 * When we are logging a new inode X, check if it doesn't have a reference that
4767 * matches the reference from some other inode Y created in a past transaction
4768 * and that was renamed in the current transaction. If we don't do this, then at
4769 * log replay time we can lose inode Y (and all its files if it's a directory):
4770 *
4771 * mkdir /mnt/x
4772 * echo "hello world" > /mnt/x/foobar
4773 * sync
4774 * mv /mnt/x /mnt/y
4775 * mkdir /mnt/x # or touch /mnt/x
4776 * xfs_io -c fsync /mnt/x
4777 * <power fail>
4778 * mount fs, trigger log replay
4779 *
4780 * After the log replay procedure, we would lose the first directory and all its
4781 * files (file foobar).
4782 * For the case where inode Y is not a directory we simply end up losing it:
4783 *
4784 * echo "123" > /mnt/foo
4785 * sync
4786 * mv /mnt/foo /mnt/bar
4787 * echo "abc" > /mnt/foo
4788 * xfs_io -c fsync /mnt/foo
4789 * <power fail>
4790 *
4791 * We also need this for cases where a snapshot entry is replaced by some other
4792 * entry (file or directory) otherwise we end up with an unreplayable log due to
4793 * attempts to delete the snapshot entry (entry of type BTRFS_ROOT_ITEM_KEY) as
4794 * if it were a regular entry:
4795 *
4796 * mkdir /mnt/x
4797 * btrfs subvolume snapshot /mnt /mnt/x/snap
4798 * btrfs subvolume delete /mnt/x/snap
4799 * rmdir /mnt/x
4800 * mkdir /mnt/x
4801 * fsync /mnt/x or fsync some new file inside it
4802 * <power fail>
4803 *
4804 * The snapshot delete, rmdir of x, mkdir of a new x and the fsync all happen in
4805 * the same transaction.
4806 */
4807static int btrfs_check_ref_name_override(struct extent_buffer *eb,
4808 const int slot,
4809 const struct btrfs_key *key,
Nikolay Borisov4791c8f2017-01-18 00:31:35 +02004810 struct btrfs_inode *inode,
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004811 u64 *other_ino, u64 *other_parent)
Filipe Manana56f23fd2016-03-30 23:37:21 +01004812{
4813 int ret;
4814 struct btrfs_path *search_path;
4815 char *name = NULL;
4816 u32 name_len = 0;
4817 u32 item_size = btrfs_item_size_nr(eb, slot);
4818 u32 cur_offset = 0;
4819 unsigned long ptr = btrfs_item_ptr_offset(eb, slot);
4820
4821 search_path = btrfs_alloc_path();
4822 if (!search_path)
4823 return -ENOMEM;
4824 search_path->search_commit_root = 1;
4825 search_path->skip_locking = 1;
4826
4827 while (cur_offset < item_size) {
4828 u64 parent;
4829 u32 this_name_len;
4830 u32 this_len;
4831 unsigned long name_ptr;
4832 struct btrfs_dir_item *di;
4833
4834 if (key->type == BTRFS_INODE_REF_KEY) {
4835 struct btrfs_inode_ref *iref;
4836
4837 iref = (struct btrfs_inode_ref *)(ptr + cur_offset);
4838 parent = key->offset;
4839 this_name_len = btrfs_inode_ref_name_len(eb, iref);
4840 name_ptr = (unsigned long)(iref + 1);
4841 this_len = sizeof(*iref) + this_name_len;
4842 } else {
4843 struct btrfs_inode_extref *extref;
4844
4845 extref = (struct btrfs_inode_extref *)(ptr +
4846 cur_offset);
4847 parent = btrfs_inode_extref_parent(eb, extref);
4848 this_name_len = btrfs_inode_extref_name_len(eb, extref);
4849 name_ptr = (unsigned long)&extref->name;
4850 this_len = sizeof(*extref) + this_name_len;
4851 }
4852
4853 if (this_name_len > name_len) {
4854 char *new_name;
4855
4856 new_name = krealloc(name, this_name_len, GFP_NOFS);
4857 if (!new_name) {
4858 ret = -ENOMEM;
4859 goto out;
4860 }
4861 name_len = this_name_len;
4862 name = new_name;
4863 }
4864
4865 read_extent_buffer(eb, name, name_ptr, this_name_len);
Nikolay Borisov4791c8f2017-01-18 00:31:35 +02004866 di = btrfs_lookup_dir_item(NULL, inode->root, search_path,
4867 parent, name, this_name_len, 0);
Filipe Manana56f23fd2016-03-30 23:37:21 +01004868 if (di && !IS_ERR(di)) {
Filipe Manana44f714d2016-06-06 16:11:13 +01004869 struct btrfs_key di_key;
4870
4871 btrfs_dir_item_key_to_cpu(search_path->nodes[0],
4872 di, &di_key);
4873 if (di_key.type == BTRFS_INODE_ITEM_KEY) {
Filipe Manana6b5fc432019-02-13 12:14:03 +00004874 if (di_key.objectid != key->objectid) {
4875 ret = 1;
4876 *other_ino = di_key.objectid;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004877 *other_parent = parent;
Filipe Manana6b5fc432019-02-13 12:14:03 +00004878 } else {
4879 ret = 0;
4880 }
Filipe Manana44f714d2016-06-06 16:11:13 +01004881 } else {
4882 ret = -EAGAIN;
4883 }
Filipe Manana56f23fd2016-03-30 23:37:21 +01004884 goto out;
4885 } else if (IS_ERR(di)) {
4886 ret = PTR_ERR(di);
4887 goto out;
4888 }
4889 btrfs_release_path(search_path);
4890
4891 cur_offset += this_len;
4892 }
4893 ret = 0;
4894out:
4895 btrfs_free_path(search_path);
4896 kfree(name);
4897 return ret;
4898}
4899
Filipe Manana6b5fc432019-02-13 12:14:03 +00004900struct btrfs_ino_list {
4901 u64 ino;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004902 u64 parent;
Filipe Manana6b5fc432019-02-13 12:14:03 +00004903 struct list_head list;
4904};
4905
4906static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
4907 struct btrfs_root *root,
4908 struct btrfs_path *path,
4909 struct btrfs_log_ctx *ctx,
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004910 u64 ino, u64 parent)
Filipe Manana6b5fc432019-02-13 12:14:03 +00004911{
4912 struct btrfs_ino_list *ino_elem;
4913 LIST_HEAD(inode_list);
4914 int ret = 0;
4915
4916 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
4917 if (!ino_elem)
4918 return -ENOMEM;
4919 ino_elem->ino = ino;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004920 ino_elem->parent = parent;
Filipe Manana6b5fc432019-02-13 12:14:03 +00004921 list_add_tail(&ino_elem->list, &inode_list);
4922
4923 while (!list_empty(&inode_list)) {
4924 struct btrfs_fs_info *fs_info = root->fs_info;
4925 struct btrfs_key key;
4926 struct inode *inode;
4927
4928 ino_elem = list_first_entry(&inode_list, struct btrfs_ino_list,
4929 list);
4930 ino = ino_elem->ino;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004931 parent = ino_elem->parent;
Filipe Manana6b5fc432019-02-13 12:14:03 +00004932 list_del(&ino_elem->list);
4933 kfree(ino_elem);
4934 if (ret)
4935 continue;
4936
4937 btrfs_release_path(path);
4938
4939 key.objectid = ino;
4940 key.type = BTRFS_INODE_ITEM_KEY;
4941 key.offset = 0;
4942 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
4943 /*
4944 * If the other inode that had a conflicting dir entry was
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004945 * deleted in the current transaction, we need to log its parent
4946 * directory.
Filipe Manana6b5fc432019-02-13 12:14:03 +00004947 */
4948 if (IS_ERR(inode)) {
4949 ret = PTR_ERR(inode);
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004950 if (ret == -ENOENT) {
4951 key.objectid = parent;
4952 inode = btrfs_iget(fs_info->sb, &key, root,
4953 NULL);
4954 if (IS_ERR(inode)) {
4955 ret = PTR_ERR(inode);
4956 } else {
4957 ret = btrfs_log_inode(trans, root,
4958 BTRFS_I(inode),
4959 LOG_OTHER_INODE_ALL,
4960 0, LLONG_MAX, ctx);
4961 iput(inode);
4962 }
4963 }
Filipe Manana6b5fc432019-02-13 12:14:03 +00004964 continue;
4965 }
4966 /*
4967 * We are safe logging the other inode without acquiring its
4968 * lock as long as we log with the LOG_INODE_EXISTS mode. We
4969 * are safe against concurrent renames of the other inode as
4970 * well because during a rename we pin the log and update the
4971 * log with the new name before we unpin it.
4972 */
4973 ret = btrfs_log_inode(trans, root, BTRFS_I(inode),
4974 LOG_OTHER_INODE, 0, LLONG_MAX, ctx);
4975 if (ret) {
4976 iput(inode);
4977 continue;
4978 }
4979
4980 key.objectid = ino;
4981 key.type = BTRFS_INODE_REF_KEY;
4982 key.offset = 0;
4983 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4984 if (ret < 0) {
4985 iput(inode);
4986 continue;
4987 }
4988
4989 while (true) {
4990 struct extent_buffer *leaf = path->nodes[0];
4991 int slot = path->slots[0];
4992 u64 other_ino = 0;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00004993 u64 other_parent = 0;
Filipe Manana6b5fc432019-02-13 12:14:03 +00004994
4995 if (slot >= btrfs_header_nritems(leaf)) {
4996 ret = btrfs_next_leaf(root, path);
4997 if (ret < 0) {
4998 break;
4999 } else if (ret > 0) {
5000 ret = 0;
5001 break;
5002 }
5003 continue;
5004 }
5005
5006 btrfs_item_key_to_cpu(leaf, &key, slot);
5007 if (key.objectid != ino ||
5008 (key.type != BTRFS_INODE_REF_KEY &&
5009 key.type != BTRFS_INODE_EXTREF_KEY)) {
5010 ret = 0;
5011 break;
5012 }
5013
5014 ret = btrfs_check_ref_name_override(leaf, slot, &key,
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005015 BTRFS_I(inode), &other_ino,
5016 &other_parent);
Filipe Manana6b5fc432019-02-13 12:14:03 +00005017 if (ret < 0)
5018 break;
5019 if (ret > 0) {
5020 ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS);
5021 if (!ino_elem) {
5022 ret = -ENOMEM;
5023 break;
5024 }
5025 ino_elem->ino = other_ino;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005026 ino_elem->parent = other_parent;
Filipe Manana6b5fc432019-02-13 12:14:03 +00005027 list_add_tail(&ino_elem->list, &inode_list);
5028 ret = 0;
5029 }
5030 path->slots[0]++;
5031 }
5032 iput(inode);
5033 }
5034
5035 return ret;
5036}
5037
Chris Masone02119d2008-09-05 16:13:11 -04005038/* log a single inode in the tree log.
5039 * At least one parent directory for this inode must exist in the tree
5040 * or be logged already.
5041 *
5042 * Any items from this inode changed by the current transaction are copied
5043 * to the log tree. An extra reference is taken on any extents in this
5044 * file, allowing us to avoid a whole pile of corner cases around logging
5045 * blocks that have been removed from the tree.
5046 *
5047 * See LOG_INODE_ALL and related defines for a description of what inode_only
5048 * does.
5049 *
5050 * This handles both files and directories.
5051 */
Chris Mason12fcfd22009-03-24 10:24:20 -04005052static int btrfs_log_inode(struct btrfs_trans_handle *trans,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005053 struct btrfs_root *root, struct btrfs_inode *inode,
Filipe Manana49dae1b2014-09-06 22:34:39 +01005054 int inode_only,
5055 const loff_t start,
Filipe Manana8407f552014-09-05 15:14:39 +01005056 const loff_t end,
5057 struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -04005058{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005059 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04005060 struct btrfs_path *path;
5061 struct btrfs_path *dst_path;
5062 struct btrfs_key min_key;
5063 struct btrfs_key max_key;
5064 struct btrfs_root *log = root->log_root;
Josef Bacik16e75492013-10-22 12:18:51 -04005065 u64 last_extent = 0;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005066 int err = 0;
Chris Masone02119d2008-09-05 16:13:11 -04005067 int ret;
Chris Mason3a5f1d42008-09-11 15:53:37 -04005068 int nritems;
Chris Mason31ff1cd2008-09-11 16:17:57 -04005069 int ins_start_slot = 0;
5070 int ins_nr;
Josef Bacik5dc562c2012-08-17 13:14:17 -04005071 bool fast_search = false;
Nikolay Borisova59108a2017-01-18 00:31:48 +02005072 u64 ino = btrfs_ino(inode);
5073 struct extent_map_tree *em_tree = &inode->extent_tree;
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005074 u64 logged_isize = 0;
Filipe Mananae4545de2015-06-17 12:49:23 +01005075 bool need_log_inode_item = true;
Filipe Manana9a8fca62018-05-11 16:42:42 +01005076 bool xattrs_logged = false;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005077 bool recursive_logging = false;
Chris Masone02119d2008-09-05 16:13:11 -04005078
Chris Masone02119d2008-09-05 16:13:11 -04005079 path = btrfs_alloc_path();
Tsutomu Itoh5df67082011-02-01 09:17:35 +00005080 if (!path)
5081 return -ENOMEM;
Chris Masone02119d2008-09-05 16:13:11 -04005082 dst_path = btrfs_alloc_path();
Tsutomu Itoh5df67082011-02-01 09:17:35 +00005083 if (!dst_path) {
5084 btrfs_free_path(path);
5085 return -ENOMEM;
5086 }
Chris Masone02119d2008-09-05 16:13:11 -04005087
Li Zefan33345d012011-04-20 10:31:50 +08005088 min_key.objectid = ino;
Chris Masone02119d2008-09-05 16:13:11 -04005089 min_key.type = BTRFS_INODE_ITEM_KEY;
5090 min_key.offset = 0;
5091
Li Zefan33345d012011-04-20 10:31:50 +08005092 max_key.objectid = ino;
Chris Mason12fcfd22009-03-24 10:24:20 -04005093
Chris Mason12fcfd22009-03-24 10:24:20 -04005094
Josef Bacik5dc562c2012-08-17 13:14:17 -04005095 /* today the code can only do partial logging of directories */
Nikolay Borisova59108a2017-01-18 00:31:48 +02005096 if (S_ISDIR(inode->vfs_inode.i_mode) ||
Miao Xie5269b672012-11-01 07:35:23 +00005097 (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005098 &inode->runtime_flags) &&
Liu Bo781feef2016-11-30 16:20:25 -08005099 inode_only >= LOG_INODE_EXISTS))
Chris Masone02119d2008-09-05 16:13:11 -04005100 max_key.type = BTRFS_XATTR_ITEM_KEY;
5101 else
5102 max_key.type = (u8)-1;
5103 max_key.offset = (u64)-1;
5104
Filipe Manana2c2c4522015-01-13 16:40:04 +00005105 /*
5106 * Only run delayed items if we are a dir or a new file.
5107 * Otherwise commit the delayed inode only, which is needed in
5108 * order for the log replay code to mark inodes for link count
5109 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
5110 */
Nikolay Borisova59108a2017-01-18 00:31:48 +02005111 if (S_ISDIR(inode->vfs_inode.i_mode) ||
5112 inode->generation > fs_info->last_trans_committed)
5113 ret = btrfs_commit_inode_delayed_items(trans, inode);
Filipe Manana2c2c4522015-01-13 16:40:04 +00005114 else
Nikolay Borisova59108a2017-01-18 00:31:48 +02005115 ret = btrfs_commit_inode_delayed_inode(inode);
Filipe Manana2c2c4522015-01-13 16:40:04 +00005116
5117 if (ret) {
5118 btrfs_free_path(path);
5119 btrfs_free_path(dst_path);
5120 return ret;
Miao Xie16cdcec2011-04-22 18:12:22 +08005121 }
5122
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005123 if (inode_only == LOG_OTHER_INODE || inode_only == LOG_OTHER_INODE_ALL) {
5124 recursive_logging = true;
5125 if (inode_only == LOG_OTHER_INODE)
5126 inode_only = LOG_INODE_EXISTS;
5127 else
5128 inode_only = LOG_INODE_ALL;
Nikolay Borisova59108a2017-01-18 00:31:48 +02005129 mutex_lock_nested(&inode->log_mutex, SINGLE_DEPTH_NESTING);
Liu Bo781feef2016-11-30 16:20:25 -08005130 } else {
Nikolay Borisova59108a2017-01-18 00:31:48 +02005131 mutex_lock(&inode->log_mutex);
Liu Bo781feef2016-11-30 16:20:25 -08005132 }
Chris Masone02119d2008-09-05 16:13:11 -04005133
Filipe Manana5e33a2b2016-02-25 23:19:38 +00005134 /*
Chris Masone02119d2008-09-05 16:13:11 -04005135 * a brute force approach to making sure we get the most uptodate
5136 * copies of everything.
5137 */
Nikolay Borisova59108a2017-01-18 00:31:48 +02005138 if (S_ISDIR(inode->vfs_inode.i_mode)) {
Chris Masone02119d2008-09-05 16:13:11 -04005139 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
5140
Filipe Manana4f764e52015-02-23 19:53:35 +00005141 if (inode_only == LOG_INODE_EXISTS)
5142 max_key_type = BTRFS_XATTR_ITEM_KEY;
Li Zefan33345d012011-04-20 10:31:50 +08005143 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
Chris Masone02119d2008-09-05 16:13:11 -04005144 } else {
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005145 if (inode_only == LOG_INODE_EXISTS) {
5146 /*
5147 * Make sure the new inode item we write to the log has
5148 * the same isize as the current one (if it exists).
5149 * This is necessary to prevent data loss after log
5150 * replay, and also to prevent doing a wrong expanding
5151 * truncate - for e.g. create file, write 4K into offset
5152 * 0, fsync, write 4K into offset 4096, add hard link,
5153 * fsync some other file (to sync log), power fail - if
5154 * we use the inode's current i_size, after log replay
5155 * we get a 8Kb file, with the last 4Kb extent as a hole
5156 * (zeroes), as if an expanding truncate happened,
5157 * instead of getting a file of 4Kb only.
5158 */
Nikolay Borisova59108a2017-01-18 00:31:48 +02005159 err = logged_inode_size(log, inode, path, &logged_isize);
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005160 if (err)
5161 goto out_unlock;
5162 }
Filipe Mananaa7429942015-02-13 16:56:14 +00005163 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005164 &inode->runtime_flags)) {
Filipe Mananaa7429942015-02-13 16:56:14 +00005165 if (inode_only == LOG_INODE_EXISTS) {
Filipe Manana4f764e52015-02-23 19:53:35 +00005166 max_key.type = BTRFS_XATTR_ITEM_KEY;
Filipe Mananaa7429942015-02-13 16:56:14 +00005167 ret = drop_objectid_items(trans, log, path, ino,
5168 max_key.type);
5169 } else {
5170 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005171 &inode->runtime_flags);
Filipe Mananaa7429942015-02-13 16:56:14 +00005172 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005173 &inode->runtime_flags);
Chris Mason28ed1342014-12-17 09:41:04 -08005174 while(1) {
5175 ret = btrfs_truncate_inode_items(trans,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005176 log, &inode->vfs_inode, 0, 0);
Chris Mason28ed1342014-12-17 09:41:04 -08005177 if (ret != -EAGAIN)
5178 break;
5179 }
Filipe Mananaa7429942015-02-13 16:56:14 +00005180 }
Filipe Manana4f764e52015-02-23 19:53:35 +00005181 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
Nikolay Borisova59108a2017-01-18 00:31:48 +02005182 &inode->runtime_flags) ||
Josef Bacik6cfab852013-11-12 16:25:58 -05005183 inode_only == LOG_INODE_EXISTS) {
Filipe Manana4f764e52015-02-23 19:53:35 +00005184 if (inode_only == LOG_INODE_ALL)
Josef Bacika95249b2012-10-11 16:17:34 -04005185 fast_search = true;
Filipe Manana4f764e52015-02-23 19:53:35 +00005186 max_key.type = BTRFS_XATTR_ITEM_KEY;
Josef Bacika95249b2012-10-11 16:17:34 -04005187 ret = drop_objectid_items(trans, log, path, ino,
5188 max_key.type);
Josef Bacik5dc562c2012-08-17 13:14:17 -04005189 } else {
Liu Bo183f37f2012-11-01 06:38:47 +00005190 if (inode_only == LOG_INODE_ALL)
5191 fast_search = true;
Josef Bacika95249b2012-10-11 16:17:34 -04005192 goto log_extents;
Josef Bacik5dc562c2012-08-17 13:14:17 -04005193 }
Josef Bacika95249b2012-10-11 16:17:34 -04005194
Chris Masone02119d2008-09-05 16:13:11 -04005195 }
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005196 if (ret) {
5197 err = ret;
5198 goto out_unlock;
5199 }
Chris Masone02119d2008-09-05 16:13:11 -04005200
Chris Masond3977122009-01-05 21:25:51 -05005201 while (1) {
Chris Mason31ff1cd2008-09-11 16:17:57 -04005202 ins_nr = 0;
Filipe David Borba Manana6174d3c2013-10-01 16:13:42 +01005203 ret = btrfs_search_forward(root, &min_key,
Eric Sandeende78b512013-01-31 18:21:12 +00005204 path, trans->transid);
Liu Bofb770ae2016-07-05 12:10:14 -07005205 if (ret < 0) {
5206 err = ret;
5207 goto out_unlock;
5208 }
Chris Masone02119d2008-09-05 16:13:11 -04005209 if (ret != 0)
5210 break;
Chris Mason3a5f1d42008-09-11 15:53:37 -04005211again:
Chris Mason31ff1cd2008-09-11 16:17:57 -04005212 /* note, ins_nr might be > 0 here, cleanup outside the loop */
Li Zefan33345d012011-04-20 10:31:50 +08005213 if (min_key.objectid != ino)
Chris Masone02119d2008-09-05 16:13:11 -04005214 break;
5215 if (min_key.type > max_key.type)
5216 break;
Chris Mason31ff1cd2008-09-11 16:17:57 -04005217
Filipe Mananae4545de2015-06-17 12:49:23 +01005218 if (min_key.type == BTRFS_INODE_ITEM_KEY)
5219 need_log_inode_item = false;
5220
Filipe Manana56f23fd2016-03-30 23:37:21 +01005221 if ((min_key.type == BTRFS_INODE_REF_KEY ||
5222 min_key.type == BTRFS_INODE_EXTREF_KEY) &&
Filipe Manana6b5fc432019-02-13 12:14:03 +00005223 inode->generation == trans->transid &&
5224 !recursive_logging) {
Filipe Manana44f714d2016-06-06 16:11:13 +01005225 u64 other_ino = 0;
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005226 u64 other_parent = 0;
Filipe Manana44f714d2016-06-06 16:11:13 +01005227
Filipe Manana56f23fd2016-03-30 23:37:21 +01005228 ret = btrfs_check_ref_name_override(path->nodes[0],
Nikolay Borisova59108a2017-01-18 00:31:48 +02005229 path->slots[0], &min_key, inode,
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005230 &other_ino, &other_parent);
Filipe Manana56f23fd2016-03-30 23:37:21 +01005231 if (ret < 0) {
5232 err = ret;
5233 goto out_unlock;
Filipe Manana28a23592016-08-23 21:13:51 +01005234 } else if (ret > 0 && ctx &&
Nikolay Borisov4a0cc7c2017-01-10 20:35:31 +02005235 other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
Filipe Manana44f714d2016-06-06 16:11:13 +01005236 if (ins_nr > 0) {
5237 ins_nr++;
5238 } else {
5239 ins_nr = 1;
5240 ins_start_slot = path->slots[0];
5241 }
Nikolay Borisova59108a2017-01-18 00:31:48 +02005242 ret = copy_items(trans, inode, dst_path, path,
Filipe Manana44f714d2016-06-06 16:11:13 +01005243 &last_extent, ins_start_slot,
5244 ins_nr, inode_only,
5245 logged_isize);
5246 if (ret < 0) {
5247 err = ret;
5248 goto out_unlock;
5249 }
5250 ins_nr = 0;
Filipe Manana6b5fc432019-02-13 12:14:03 +00005251
5252 err = log_conflicting_inodes(trans, root, path,
Filipe Mananaa3baaf02019-02-13 12:14:09 +00005253 ctx, other_ino, other_parent);
Filipe Manana44f714d2016-06-06 16:11:13 +01005254 if (err)
5255 goto out_unlock;
Filipe Manana6b5fc432019-02-13 12:14:03 +00005256 btrfs_release_path(path);
5257 goto next_key;
Filipe Manana56f23fd2016-03-30 23:37:21 +01005258 }
5259 }
5260
Filipe Manana36283bf2015-06-20 00:44:51 +01005261 /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
5262 if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
5263 if (ins_nr == 0)
5264 goto next_slot;
Nikolay Borisova59108a2017-01-18 00:31:48 +02005265 ret = copy_items(trans, inode, dst_path, path,
Filipe Manana36283bf2015-06-20 00:44:51 +01005266 &last_extent, ins_start_slot,
5267 ins_nr, inode_only, logged_isize);
5268 if (ret < 0) {
5269 err = ret;
5270 goto out_unlock;
5271 }
5272 ins_nr = 0;
5273 if (ret) {
5274 btrfs_release_path(path);
5275 continue;
5276 }
5277 goto next_slot;
5278 }
5279
Chris Mason31ff1cd2008-09-11 16:17:57 -04005280 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
5281 ins_nr++;
5282 goto next_slot;
5283 } else if (!ins_nr) {
5284 ins_start_slot = path->slots[0];
5285 ins_nr = 1;
5286 goto next_slot;
Chris Masone02119d2008-09-05 16:13:11 -04005287 }
5288
Nikolay Borisova59108a2017-01-18 00:31:48 +02005289 ret = copy_items(trans, inode, dst_path, path, &last_extent,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005290 ins_start_slot, ins_nr, inode_only,
5291 logged_isize);
Josef Bacik16e75492013-10-22 12:18:51 -04005292 if (ret < 0) {
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005293 err = ret;
5294 goto out_unlock;
Rasmus Villemoesa71db862014-06-20 21:51:43 +02005295 }
5296 if (ret) {
Josef Bacik16e75492013-10-22 12:18:51 -04005297 ins_nr = 0;
5298 btrfs_release_path(path);
5299 continue;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005300 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04005301 ins_nr = 1;
5302 ins_start_slot = path->slots[0];
5303next_slot:
Chris Masone02119d2008-09-05 16:13:11 -04005304
Chris Mason3a5f1d42008-09-11 15:53:37 -04005305 nritems = btrfs_header_nritems(path->nodes[0]);
5306 path->slots[0]++;
5307 if (path->slots[0] < nritems) {
5308 btrfs_item_key_to_cpu(path->nodes[0], &min_key,
5309 path->slots[0]);
5310 goto again;
5311 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04005312 if (ins_nr) {
Nikolay Borisova59108a2017-01-18 00:31:48 +02005313 ret = copy_items(trans, inode, dst_path, path,
Josef Bacik16e75492013-10-22 12:18:51 -04005314 &last_extent, ins_start_slot,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005315 ins_nr, inode_only, logged_isize);
Josef Bacik16e75492013-10-22 12:18:51 -04005316 if (ret < 0) {
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005317 err = ret;
5318 goto out_unlock;
5319 }
Josef Bacik16e75492013-10-22 12:18:51 -04005320 ret = 0;
Chris Mason31ff1cd2008-09-11 16:17:57 -04005321 ins_nr = 0;
5322 }
David Sterbab3b4aa72011-04-21 01:20:15 +02005323 btrfs_release_path(path);
Filipe Manana44f714d2016-06-06 16:11:13 +01005324next_key:
Filipe David Borba Manana3d41d702013-10-01 17:06:53 +01005325 if (min_key.offset < (u64)-1) {
Chris Masone02119d2008-09-05 16:13:11 -04005326 min_key.offset++;
Filipe David Borba Manana3d41d702013-10-01 17:06:53 +01005327 } else if (min_key.type < max_key.type) {
Chris Masone02119d2008-09-05 16:13:11 -04005328 min_key.type++;
Filipe David Borba Manana3d41d702013-10-01 17:06:53 +01005329 min_key.offset = 0;
5330 } else {
Chris Masone02119d2008-09-05 16:13:11 -04005331 break;
Filipe David Borba Manana3d41d702013-10-01 17:06:53 +01005332 }
Chris Masone02119d2008-09-05 16:13:11 -04005333 }
Chris Mason31ff1cd2008-09-11 16:17:57 -04005334 if (ins_nr) {
Nikolay Borisova59108a2017-01-18 00:31:48 +02005335 ret = copy_items(trans, inode, dst_path, path, &last_extent,
Filipe Manana1a4bcf42015-02-13 12:30:56 +00005336 ins_start_slot, ins_nr, inode_only,
5337 logged_isize);
Josef Bacik16e75492013-10-22 12:18:51 -04005338 if (ret < 0) {
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005339 err = ret;
5340 goto out_unlock;
5341 }
Josef Bacik16e75492013-10-22 12:18:51 -04005342 ret = 0;
Chris Mason31ff1cd2008-09-11 16:17:57 -04005343 ins_nr = 0;
5344 }
Josef Bacik5dc562c2012-08-17 13:14:17 -04005345
Filipe Manana36283bf2015-06-20 00:44:51 +01005346 btrfs_release_path(path);
5347 btrfs_release_path(dst_path);
Nikolay Borisova59108a2017-01-18 00:31:48 +02005348 err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path);
Filipe Manana36283bf2015-06-20 00:44:51 +01005349 if (err)
5350 goto out_unlock;
Filipe Manana9a8fca62018-05-11 16:42:42 +01005351 xattrs_logged = true;
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01005352 if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
5353 btrfs_release_path(path);
5354 btrfs_release_path(dst_path);
Nikolay Borisova59108a2017-01-18 00:31:48 +02005355 err = btrfs_log_trailing_hole(trans, root, inode, path);
Filipe Mananaa89ca6f2015-06-25 04:17:46 +01005356 if (err)
5357 goto out_unlock;
5358 }
Josef Bacika95249b2012-10-11 16:17:34 -04005359log_extents:
Josef Bacikf3b15cc2013-07-22 12:54:30 -04005360 btrfs_release_path(path);
5361 btrfs_release_path(dst_path);
Filipe Mananae4545de2015-06-17 12:49:23 +01005362 if (need_log_inode_item) {
Nikolay Borisova59108a2017-01-18 00:31:48 +02005363 err = log_inode_item(trans, log, dst_path, inode);
Filipe Manana9a8fca62018-05-11 16:42:42 +01005364 if (!err && !xattrs_logged) {
5365 err = btrfs_log_all_xattrs(trans, root, inode, path,
5366 dst_path);
5367 btrfs_release_path(path);
5368 }
Filipe Mananae4545de2015-06-17 12:49:23 +01005369 if (err)
5370 goto out_unlock;
5371 }
Josef Bacik5dc562c2012-08-17 13:14:17 -04005372 if (fast_search) {
Nikolay Borisova59108a2017-01-18 00:31:48 +02005373 ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
Josef Bacika2120a42018-05-23 11:58:35 -04005374 ctx, start, end);
Josef Bacik5dc562c2012-08-17 13:14:17 -04005375 if (ret) {
5376 err = ret;
5377 goto out_unlock;
5378 }
Josef Bacikd006a042013-11-12 20:54:09 -05005379 } else if (inode_only == LOG_INODE_ALL) {
Liu Bo06d3d222012-08-27 10:52:19 -06005380 struct extent_map *em, *n;
5381
Filipe Manana49dae1b2014-09-06 22:34:39 +01005382 write_lock(&em_tree->lock);
5383 /*
5384 * We can't just remove every em if we're called for a ranged
5385 * fsync - that is, one that doesn't cover the whole possible
5386 * file range (0 to LLONG_MAX). This is because we can have
5387 * em's that fall outside the range we're logging and therefore
5388 * their ordered operations haven't completed yet
5389 * (btrfs_finish_ordered_io() not invoked yet). This means we
5390 * didn't get their respective file extent item in the fs/subvol
5391 * tree yet, and need to let the next fast fsync (one which
5392 * consults the list of modified extent maps) find the em so
5393 * that it logs a matching file extent item and waits for the
5394 * respective ordered operation to complete (if it's still
5395 * running).
5396 *
5397 * Removing every em outside the range we're logging would make
5398 * the next fast fsync not log their matching file extent items,
5399 * therefore making us lose data after a log replay.
5400 */
5401 list_for_each_entry_safe(em, n, &em_tree->modified_extents,
5402 list) {
5403 const u64 mod_end = em->mod_start + em->mod_len - 1;
5404
5405 if (em->mod_start >= start && mod_end <= end)
5406 list_del_init(&em->list);
5407 }
5408 write_unlock(&em_tree->lock);
Josef Bacik5dc562c2012-08-17 13:14:17 -04005409 }
5410
Nikolay Borisova59108a2017-01-18 00:31:48 +02005411 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->vfs_inode.i_mode)) {
5412 ret = log_directory_changes(trans, root, inode, path, dst_path,
5413 ctx);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005414 if (ret) {
5415 err = ret;
5416 goto out_unlock;
5417 }
Chris Masone02119d2008-09-05 16:13:11 -04005418 }
Filipe Manana49dae1b2014-09-06 22:34:39 +01005419
Nikolay Borisova59108a2017-01-18 00:31:48 +02005420 spin_lock(&inode->lock);
5421 inode->logged_trans = trans->transid;
5422 inode->last_log_commit = inode->last_sub_trans;
5423 spin_unlock(&inode->lock);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005424out_unlock:
Nikolay Borisova59108a2017-01-18 00:31:48 +02005425 mutex_unlock(&inode->log_mutex);
Chris Masone02119d2008-09-05 16:13:11 -04005426
5427 btrfs_free_path(path);
5428 btrfs_free_path(dst_path);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005429 return err;
Chris Masone02119d2008-09-05 16:13:11 -04005430}
5431
Chris Mason12fcfd22009-03-24 10:24:20 -04005432/*
Filipe Manana2be63d52016-02-12 11:34:23 +00005433 * Check if we must fallback to a transaction commit when logging an inode.
5434 * This must be called after logging the inode and is used only in the context
5435 * when fsyncing an inode requires the need to log some other inode - in which
5436 * case we can't lock the i_mutex of each other inode we need to log as that
5437 * can lead to deadlocks with concurrent fsync against other inodes (as we can
5438 * log inodes up or down in the hierarchy) or rename operations for example. So
5439 * we take the log_mutex of the inode after we have logged it and then check for
5440 * its last_unlink_trans value - this is safe because any task setting
5441 * last_unlink_trans must take the log_mutex and it must do this before it does
5442 * the actual unlink operation, so if we do this check before a concurrent task
5443 * sets last_unlink_trans it means we've logged a consistent version/state of
5444 * all the inode items, otherwise we are not sure and must do a transaction
Nicholas D Steeves01327612016-05-19 21:18:45 -04005445 * commit (the concurrent task might have only updated last_unlink_trans before
Filipe Manana2be63d52016-02-12 11:34:23 +00005446 * we logged the inode or it might have also done the unlink).
5447 */
5448static bool btrfs_must_commit_transaction(struct btrfs_trans_handle *trans,
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005449 struct btrfs_inode *inode)
Filipe Manana2be63d52016-02-12 11:34:23 +00005450{
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005451 struct btrfs_fs_info *fs_info = inode->root->fs_info;
Filipe Manana2be63d52016-02-12 11:34:23 +00005452 bool ret = false;
5453
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005454 mutex_lock(&inode->log_mutex);
5455 if (inode->last_unlink_trans > fs_info->last_trans_committed) {
Filipe Manana2be63d52016-02-12 11:34:23 +00005456 /*
5457 * Make sure any commits to the log are forced to be full
5458 * commits.
5459 */
David Sterba90787762019-03-20 13:28:05 +01005460 btrfs_set_log_full_commit(trans);
Filipe Manana2be63d52016-02-12 11:34:23 +00005461 ret = true;
5462 }
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005463 mutex_unlock(&inode->log_mutex);
Filipe Manana2be63d52016-02-12 11:34:23 +00005464
5465 return ret;
5466}
5467
5468/*
Chris Mason12fcfd22009-03-24 10:24:20 -04005469 * follow the dentry parent pointers up the chain and see if any
5470 * of the directories in it require a full commit before they can
5471 * be logged. Returns zero if nothing special needs to be done or 1 if
5472 * a full commit is required.
5473 */
5474static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005475 struct btrfs_inode *inode,
Chris Mason12fcfd22009-03-24 10:24:20 -04005476 struct dentry *parent,
5477 struct super_block *sb,
5478 u64 last_committed)
Chris Masone02119d2008-09-05 16:13:11 -04005479{
Chris Mason12fcfd22009-03-24 10:24:20 -04005480 int ret = 0;
Josef Bacik6a912212010-11-20 09:48:00 +00005481 struct dentry *old_parent = NULL;
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005482 struct btrfs_inode *orig_inode = inode;
Chris Masone02119d2008-09-05 16:13:11 -04005483
Chris Masonaf4176b2009-03-24 10:24:31 -04005484 /*
5485 * for regular files, if its inode is already on disk, we don't
5486 * have to worry about the parents at all. This is because
5487 * we can use the last_unlink_trans field to record renames
5488 * and other fun in this file.
5489 */
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005490 if (S_ISREG(inode->vfs_inode.i_mode) &&
5491 inode->generation <= last_committed &&
5492 inode->last_unlink_trans <= last_committed)
5493 goto out;
Chris Masonaf4176b2009-03-24 10:24:31 -04005494
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005495 if (!S_ISDIR(inode->vfs_inode.i_mode)) {
Al Virofc640052016-04-10 01:33:30 -04005496 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
Chris Mason12fcfd22009-03-24 10:24:20 -04005497 goto out;
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005498 inode = BTRFS_I(d_inode(parent));
Chris Mason12fcfd22009-03-24 10:24:20 -04005499 }
5500
5501 while (1) {
Josef Bacikde2b5302013-09-11 09:36:30 -04005502 /*
5503 * If we are logging a directory then we start with our inode,
Nicholas D Steeves01327612016-05-19 21:18:45 -04005504 * not our parent's inode, so we need to skip setting the
Josef Bacikde2b5302013-09-11 09:36:30 -04005505 * logged_trans so that further down in the log code we don't
5506 * think this inode has already been logged.
5507 */
5508 if (inode != orig_inode)
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005509 inode->logged_trans = trans->transid;
Chris Mason12fcfd22009-03-24 10:24:20 -04005510 smp_mb();
5511
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005512 if (btrfs_must_commit_transaction(trans, inode)) {
Chris Mason12fcfd22009-03-24 10:24:20 -04005513 ret = 1;
5514 break;
5515 }
5516
Al Virofc640052016-04-10 01:33:30 -04005517 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
Chris Mason12fcfd22009-03-24 10:24:20 -04005518 break;
5519
Filipe Manana44f714d2016-06-06 16:11:13 +01005520 if (IS_ROOT(parent)) {
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005521 inode = BTRFS_I(d_inode(parent));
5522 if (btrfs_must_commit_transaction(trans, inode))
Filipe Manana44f714d2016-06-06 16:11:13 +01005523 ret = 1;
Chris Mason12fcfd22009-03-24 10:24:20 -04005524 break;
Filipe Manana44f714d2016-06-06 16:11:13 +01005525 }
Chris Mason12fcfd22009-03-24 10:24:20 -04005526
Josef Bacik6a912212010-11-20 09:48:00 +00005527 parent = dget_parent(parent);
5528 dput(old_parent);
5529 old_parent = parent;
Nikolay Borisovaefa6112017-02-20 13:51:00 +02005530 inode = BTRFS_I(d_inode(parent));
Chris Mason12fcfd22009-03-24 10:24:20 -04005531
5532 }
Josef Bacik6a912212010-11-20 09:48:00 +00005533 dput(old_parent);
Chris Mason12fcfd22009-03-24 10:24:20 -04005534out:
Chris Masone02119d2008-09-05 16:13:11 -04005535 return ret;
5536}
5537
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005538struct btrfs_dir_list {
5539 u64 ino;
5540 struct list_head list;
5541};
5542
5543/*
5544 * Log the inodes of the new dentries of a directory. See log_dir_items() for
5545 * details about the why it is needed.
5546 * This is a recursive operation - if an existing dentry corresponds to a
5547 * directory, that directory's new entries are logged too (same behaviour as
5548 * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes
5549 * the dentries point to we do not lock their i_mutex, otherwise lockdep
5550 * complains about the following circular lock dependency / possible deadlock:
5551 *
5552 * CPU0 CPU1
5553 * ---- ----
5554 * lock(&type->i_mutex_dir_key#3/2);
5555 * lock(sb_internal#2);
5556 * lock(&type->i_mutex_dir_key#3/2);
5557 * lock(&sb->s_type->i_mutex_key#14);
5558 *
5559 * Where sb_internal is the lock (a counter that works as a lock) acquired by
5560 * sb_start_intwrite() in btrfs_start_transaction().
5561 * Not locking i_mutex of the inodes is still safe because:
5562 *
5563 * 1) For regular files we log with a mode of LOG_INODE_EXISTS. It's possible
5564 * that while logging the inode new references (names) are added or removed
5565 * from the inode, leaving the logged inode item with a link count that does
5566 * not match the number of logged inode reference items. This is fine because
5567 * at log replay time we compute the real number of links and correct the
5568 * link count in the inode item (see replay_one_buffer() and
5569 * link_to_fixup_dir());
5570 *
5571 * 2) For directories we log with a mode of LOG_INODE_ALL. It's possible that
5572 * while logging the inode's items new items with keys BTRFS_DIR_ITEM_KEY and
5573 * BTRFS_DIR_INDEX_KEY are added to fs/subvol tree and the logged inode item
5574 * has a size that doesn't match the sum of the lengths of all the logged
5575 * names. This does not result in a problem because if a dir_item key is
5576 * logged but its matching dir_index key is not logged, at log replay time we
5577 * don't use it to replay the respective name (see replay_one_name()). On the
5578 * other hand if only the dir_index key ends up being logged, the respective
5579 * name is added to the fs/subvol tree with both the dir_item and dir_index
5580 * keys created (see replay_one_name()).
5581 * The directory's inode item with a wrong i_size is not a problem as well,
5582 * since we don't use it at log replay time to set the i_size in the inode
5583 * item of the fs/subvol tree (see overwrite_item()).
5584 */
5585static int log_new_dir_dentries(struct btrfs_trans_handle *trans,
5586 struct btrfs_root *root,
Nikolay Borisov51cc0d32017-01-18 00:31:43 +02005587 struct btrfs_inode *start_inode,
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005588 struct btrfs_log_ctx *ctx)
5589{
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005590 struct btrfs_fs_info *fs_info = root->fs_info;
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005591 struct btrfs_root *log = root->log_root;
5592 struct btrfs_path *path;
5593 LIST_HEAD(dir_list);
5594 struct btrfs_dir_list *dir_elem;
5595 int ret = 0;
5596
5597 path = btrfs_alloc_path();
5598 if (!path)
5599 return -ENOMEM;
5600
5601 dir_elem = kmalloc(sizeof(*dir_elem), GFP_NOFS);
5602 if (!dir_elem) {
5603 btrfs_free_path(path);
5604 return -ENOMEM;
5605 }
Nikolay Borisov51cc0d32017-01-18 00:31:43 +02005606 dir_elem->ino = btrfs_ino(start_inode);
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005607 list_add_tail(&dir_elem->list, &dir_list);
5608
5609 while (!list_empty(&dir_list)) {
5610 struct extent_buffer *leaf;
5611 struct btrfs_key min_key;
5612 int nritems;
5613 int i;
5614
5615 dir_elem = list_first_entry(&dir_list, struct btrfs_dir_list,
5616 list);
5617 if (ret)
5618 goto next_dir_inode;
5619
5620 min_key.objectid = dir_elem->ino;
5621 min_key.type = BTRFS_DIR_ITEM_KEY;
5622 min_key.offset = 0;
5623again:
5624 btrfs_release_path(path);
5625 ret = btrfs_search_forward(log, &min_key, path, trans->transid);
5626 if (ret < 0) {
5627 goto next_dir_inode;
5628 } else if (ret > 0) {
5629 ret = 0;
5630 goto next_dir_inode;
5631 }
5632
5633process_leaf:
5634 leaf = path->nodes[0];
5635 nritems = btrfs_header_nritems(leaf);
5636 for (i = path->slots[0]; i < nritems; i++) {
5637 struct btrfs_dir_item *di;
5638 struct btrfs_key di_key;
5639 struct inode *di_inode;
5640 struct btrfs_dir_list *new_dir_elem;
5641 int log_mode = LOG_INODE_EXISTS;
5642 int type;
5643
5644 btrfs_item_key_to_cpu(leaf, &min_key, i);
5645 if (min_key.objectid != dir_elem->ino ||
5646 min_key.type != BTRFS_DIR_ITEM_KEY)
5647 goto next_dir_inode;
5648
5649 di = btrfs_item_ptr(leaf, i, struct btrfs_dir_item);
5650 type = btrfs_dir_type(leaf, di);
5651 if (btrfs_dir_transid(leaf, di) < trans->transid &&
5652 type != BTRFS_FT_DIR)
5653 continue;
5654 btrfs_dir_item_key_to_cpu(leaf, di, &di_key);
5655 if (di_key.type == BTRFS_ROOT_ITEM_KEY)
5656 continue;
5657
Robbie Koec125cf2016-10-28 10:48:26 +08005658 btrfs_release_path(path);
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005659 di_inode = btrfs_iget(fs_info->sb, &di_key, root, NULL);
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005660 if (IS_ERR(di_inode)) {
5661 ret = PTR_ERR(di_inode);
5662 goto next_dir_inode;
5663 }
5664
Nikolay Borisov0f8939b2017-01-18 00:31:30 +02005665 if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005666 iput(di_inode);
Robbie Koec125cf2016-10-28 10:48:26 +08005667 break;
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005668 }
5669
5670 ctx->log_new_dentries = false;
Filipe Manana3f9749f2016-04-25 04:45:02 +01005671 if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005672 log_mode = LOG_INODE_ALL;
Nikolay Borisova59108a2017-01-18 00:31:48 +02005673 ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005674 log_mode, 0, LLONG_MAX, ctx);
Filipe Manana2be63d52016-02-12 11:34:23 +00005675 if (!ret &&
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005676 btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
Filipe Manana2be63d52016-02-12 11:34:23 +00005677 ret = 1;
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005678 iput(di_inode);
5679 if (ret)
5680 goto next_dir_inode;
5681 if (ctx->log_new_dentries) {
5682 new_dir_elem = kmalloc(sizeof(*new_dir_elem),
5683 GFP_NOFS);
5684 if (!new_dir_elem) {
5685 ret = -ENOMEM;
5686 goto next_dir_inode;
5687 }
5688 new_dir_elem->ino = di_key.objectid;
5689 list_add_tail(&new_dir_elem->list, &dir_list);
5690 }
5691 break;
5692 }
5693 if (i == nritems) {
5694 ret = btrfs_next_leaf(log, path);
5695 if (ret < 0) {
5696 goto next_dir_inode;
5697 } else if (ret > 0) {
5698 ret = 0;
5699 goto next_dir_inode;
5700 }
5701 goto process_leaf;
5702 }
5703 if (min_key.offset < (u64)-1) {
5704 min_key.offset++;
5705 goto again;
5706 }
5707next_dir_inode:
5708 list_del(&dir_elem->list);
5709 kfree(dir_elem);
5710 }
5711
5712 btrfs_free_path(path);
5713 return ret;
5714}
5715
Filipe Manana18aa0922015-08-05 16:49:08 +01005716static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
Nikolay Borisovd0a0b782017-02-20 13:50:30 +02005717 struct btrfs_inode *inode,
Filipe Manana18aa0922015-08-05 16:49:08 +01005718 struct btrfs_log_ctx *ctx)
5719{
David Sterba3ffbd682018-06-29 10:56:42 +02005720 struct btrfs_fs_info *fs_info = trans->fs_info;
Filipe Manana18aa0922015-08-05 16:49:08 +01005721 int ret;
5722 struct btrfs_path *path;
5723 struct btrfs_key key;
Nikolay Borisovd0a0b782017-02-20 13:50:30 +02005724 struct btrfs_root *root = inode->root;
5725 const u64 ino = btrfs_ino(inode);
Filipe Manana18aa0922015-08-05 16:49:08 +01005726
5727 path = btrfs_alloc_path();
5728 if (!path)
5729 return -ENOMEM;
5730 path->skip_locking = 1;
5731 path->search_commit_root = 1;
5732
5733 key.objectid = ino;
5734 key.type = BTRFS_INODE_REF_KEY;
5735 key.offset = 0;
5736 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5737 if (ret < 0)
5738 goto out;
5739
5740 while (true) {
5741 struct extent_buffer *leaf = path->nodes[0];
5742 int slot = path->slots[0];
5743 u32 cur_offset = 0;
5744 u32 item_size;
5745 unsigned long ptr;
5746
5747 if (slot >= btrfs_header_nritems(leaf)) {
5748 ret = btrfs_next_leaf(root, path);
5749 if (ret < 0)
5750 goto out;
5751 else if (ret > 0)
5752 break;
5753 continue;
5754 }
5755
5756 btrfs_item_key_to_cpu(leaf, &key, slot);
5757 /* BTRFS_INODE_EXTREF_KEY is BTRFS_INODE_REF_KEY + 1 */
5758 if (key.objectid != ino || key.type > BTRFS_INODE_EXTREF_KEY)
5759 break;
5760
5761 item_size = btrfs_item_size_nr(leaf, slot);
5762 ptr = btrfs_item_ptr_offset(leaf, slot);
5763 while (cur_offset < item_size) {
5764 struct btrfs_key inode_key;
5765 struct inode *dir_inode;
5766
5767 inode_key.type = BTRFS_INODE_ITEM_KEY;
5768 inode_key.offset = 0;
5769
5770 if (key.type == BTRFS_INODE_EXTREF_KEY) {
5771 struct btrfs_inode_extref *extref;
5772
5773 extref = (struct btrfs_inode_extref *)
5774 (ptr + cur_offset);
5775 inode_key.objectid = btrfs_inode_extref_parent(
5776 leaf, extref);
5777 cur_offset += sizeof(*extref);
5778 cur_offset += btrfs_inode_extref_name_len(leaf,
5779 extref);
5780 } else {
5781 inode_key.objectid = key.offset;
5782 cur_offset = item_size;
5783 }
5784
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005785 dir_inode = btrfs_iget(fs_info->sb, &inode_key,
Filipe Manana18aa0922015-08-05 16:49:08 +01005786 root, NULL);
Filipe Manana0f375ee2018-10-09 15:05:29 +01005787 /*
5788 * If the parent inode was deleted, return an error to
5789 * fallback to a transaction commit. This is to prevent
5790 * getting an inode that was moved from one parent A to
5791 * a parent B, got its former parent A deleted and then
5792 * it got fsync'ed, from existing at both parents after
5793 * a log replay (and the old parent still existing).
5794 * Example:
5795 *
5796 * mkdir /mnt/A
5797 * mkdir /mnt/B
5798 * touch /mnt/B/bar
5799 * sync
5800 * mv /mnt/B/bar /mnt/A/bar
5801 * mv -T /mnt/A /mnt/B
5802 * fsync /mnt/B/bar
5803 * <power fail>
5804 *
5805 * If we ignore the old parent B which got deleted,
5806 * after a log replay we would have file bar linked
5807 * at both parents and the old parent B would still
5808 * exist.
5809 */
5810 if (IS_ERR(dir_inode)) {
5811 ret = PTR_ERR(dir_inode);
5812 goto out;
5813 }
Filipe Manana18aa0922015-08-05 16:49:08 +01005814
Filipe Manana657ed1a2016-04-06 17:11:56 +01005815 if (ctx)
5816 ctx->log_new_dentries = false;
Nikolay Borisova59108a2017-01-18 00:31:48 +02005817 ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode),
Filipe Manana18aa0922015-08-05 16:49:08 +01005818 LOG_INODE_ALL, 0, LLONG_MAX, ctx);
Filipe Manana2be63d52016-02-12 11:34:23 +00005819 if (!ret &&
Nikolay Borisovab1717b2017-01-18 00:31:27 +02005820 btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode)))
Filipe Manana2be63d52016-02-12 11:34:23 +00005821 ret = 1;
Filipe Manana657ed1a2016-04-06 17:11:56 +01005822 if (!ret && ctx && ctx->log_new_dentries)
5823 ret = log_new_dir_dentries(trans, root,
David Sterbaf85b7372017-01-20 14:54:07 +01005824 BTRFS_I(dir_inode), ctx);
Filipe Manana18aa0922015-08-05 16:49:08 +01005825 iput(dir_inode);
5826 if (ret)
5827 goto out;
5828 }
5829 path->slots[0]++;
5830 }
5831 ret = 0;
5832out:
5833 btrfs_free_path(path);
5834 return ret;
5835}
5836
Chris Masone02119d2008-09-05 16:13:11 -04005837/*
5838 * helper function around btrfs_log_inode to make sure newly created
5839 * parent directories also end up in the log. A minimal inode and backref
5840 * only logging is done of any parent directories that are older than
5841 * the last committed transaction
5842 */
Eric Sandeen48a3b632013-04-25 20:41:01 +00005843static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005844 struct btrfs_inode *inode,
Filipe Manana49dae1b2014-09-06 22:34:39 +01005845 struct dentry *parent,
5846 const loff_t start,
5847 const loff_t end,
Edmund Nadolski41a1ead2017-11-20 13:24:47 -07005848 int inode_only,
Miao Xie8b050d32014-02-20 18:08:58 +08005849 struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -04005850{
Nikolay Borisovf8822742018-02-27 17:37:17 +02005851 struct btrfs_root *root = inode->root;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005852 struct btrfs_fs_info *fs_info = root->fs_info;
Chris Masone02119d2008-09-05 16:13:11 -04005853 struct super_block *sb;
Josef Bacik6a912212010-11-20 09:48:00 +00005854 struct dentry *old_parent = NULL;
Chris Mason12fcfd22009-03-24 10:24:20 -04005855 int ret = 0;
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005856 u64 last_committed = fs_info->last_trans_committed;
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005857 bool log_dentries = false;
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005858 struct btrfs_inode *orig_inode = inode;
Chris Mason12fcfd22009-03-24 10:24:20 -04005859
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005860 sb = inode->vfs_inode.i_sb;
Chris Mason12fcfd22009-03-24 10:24:20 -04005861
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005862 if (btrfs_test_opt(fs_info, NOTREELOG)) {
Sage Weil3a5e1402009-04-02 16:49:40 -04005863 ret = 1;
5864 goto end_no_trans;
5865 }
5866
Miao Xie995946d2014-04-02 19:51:06 +08005867 /*
5868 * The prev transaction commit doesn't complete, we need do
5869 * full commit by ourselves.
5870 */
Jeff Mahoney0b246af2016-06-22 18:54:23 -04005871 if (fs_info->last_trans_log_full_commit >
5872 fs_info->last_trans_committed) {
Chris Mason12fcfd22009-03-24 10:24:20 -04005873 ret = 1;
5874 goto end_no_trans;
5875 }
5876
Nikolay Borisovf8822742018-02-27 17:37:17 +02005877 if (btrfs_root_refs(&root->root_item) == 0) {
Yan, Zheng76dda932009-09-21 16:00:26 -04005878 ret = 1;
5879 goto end_no_trans;
5880 }
5881
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005882 ret = check_parent_dirs_for_sync(trans, inode, parent, sb,
5883 last_committed);
Chris Mason12fcfd22009-03-24 10:24:20 -04005884 if (ret)
5885 goto end_no_trans;
Chris Masone02119d2008-09-05 16:13:11 -04005886
Filipe Mananaf2d72f42018-10-08 11:12:55 +01005887 /*
5888 * Skip already logged inodes or inodes corresponding to tmpfiles
5889 * (since logging them is pointless, a link count of 0 means they
5890 * will never be accessible).
5891 */
5892 if (btrfs_inode_in_log(inode, trans->transid) ||
5893 inode->vfs_inode.i_nlink == 0) {
Chris Mason257c62e2009-10-13 13:21:08 -04005894 ret = BTRFS_NO_LOG_SYNC;
5895 goto end_no_trans;
5896 }
5897
Miao Xie8b050d32014-02-20 18:08:58 +08005898 ret = start_log_trans(trans, root, ctx);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005899 if (ret)
Miao Xiee87ac132014-02-20 18:08:53 +08005900 goto end_no_trans;
Chris Mason12fcfd22009-03-24 10:24:20 -04005901
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005902 ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005903 if (ret)
5904 goto end_trans;
Chris Mason12fcfd22009-03-24 10:24:20 -04005905
Chris Masonaf4176b2009-03-24 10:24:31 -04005906 /*
5907 * for regular files, if its inode is already on disk, we don't
5908 * have to worry about the parents at all. This is because
5909 * we can use the last_unlink_trans field to record renames
5910 * and other fun in this file.
5911 */
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005912 if (S_ISREG(inode->vfs_inode.i_mode) &&
5913 inode->generation <= last_committed &&
5914 inode->last_unlink_trans <= last_committed) {
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005915 ret = 0;
5916 goto end_trans;
5917 }
Chris Masonaf4176b2009-03-24 10:24:31 -04005918
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005919 if (S_ISDIR(inode->vfs_inode.i_mode) && ctx && ctx->log_new_dentries)
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00005920 log_dentries = true;
5921
Filipe Manana18aa0922015-08-05 16:49:08 +01005922 /*
Nicholas D Steeves01327612016-05-19 21:18:45 -04005923 * On unlink we must make sure all our current and old parent directory
Filipe Manana18aa0922015-08-05 16:49:08 +01005924 * inodes are fully logged. This is to prevent leaving dangling
5925 * directory index entries in directories that were our parents but are
5926 * not anymore. Not doing this results in old parent directory being
5927 * impossible to delete after log replay (rmdir will always fail with
5928 * error -ENOTEMPTY).
5929 *
5930 * Example 1:
5931 *
5932 * mkdir testdir
5933 * touch testdir/foo
5934 * ln testdir/foo testdir/bar
5935 * sync
5936 * unlink testdir/bar
5937 * xfs_io -c fsync testdir/foo
5938 * <power failure>
5939 * mount fs, triggers log replay
5940 *
5941 * If we don't log the parent directory (testdir), after log replay the
5942 * directory still has an entry pointing to the file inode using the bar
5943 * name, but a matching BTRFS_INODE_[REF|EXTREF]_KEY does not exist and
5944 * the file inode has a link count of 1.
5945 *
5946 * Example 2:
5947 *
5948 * mkdir testdir
5949 * touch foo
5950 * ln foo testdir/foo2
5951 * ln foo testdir/foo3
5952 * sync
5953 * unlink testdir/foo3
5954 * xfs_io -c fsync foo
5955 * <power failure>
5956 * mount fs, triggers log replay
5957 *
5958 * Similar as the first example, after log replay the parent directory
5959 * testdir still has an entry pointing to the inode file with name foo3
5960 * but the file inode does not have a matching BTRFS_INODE_REF_KEY item
5961 * and has a link count of 2.
5962 */
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005963 if (inode->last_unlink_trans > last_committed) {
Filipe Manana18aa0922015-08-05 16:49:08 +01005964 ret = btrfs_log_all_parents(trans, orig_inode, ctx);
5965 if (ret)
5966 goto end_trans;
5967 }
5968
Filipe Manana41bd6062018-11-28 14:54:28 +00005969 /*
5970 * If a new hard link was added to the inode in the current transaction
5971 * and its link count is now greater than 1, we need to fallback to a
5972 * transaction commit, otherwise we can end up not logging all its new
5973 * parents for all the hard links. Here just from the dentry used to
5974 * fsync, we can not visit the ancestor inodes for all the other hard
5975 * links to figure out if any is new, so we fallback to a transaction
5976 * commit (instead of adding a lot of complexity of scanning a btree,
5977 * since this scenario is not a common use case).
5978 */
5979 if (inode->vfs_inode.i_nlink > 1 &&
5980 inode->last_link_trans > last_committed) {
5981 ret = -EMLINK;
5982 goto end_trans;
5983 }
5984
Chris Masond3977122009-01-05 21:25:51 -05005985 while (1) {
Al Virofc640052016-04-10 01:33:30 -04005986 if (!parent || d_really_is_negative(parent) || sb != parent->d_sb)
Chris Masone02119d2008-09-05 16:13:11 -04005987 break;
5988
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005989 inode = BTRFS_I(d_inode(parent));
5990 if (root != inode->root)
Yan, Zheng76dda932009-09-21 16:00:26 -04005991 break;
5992
Nikolay Borisov19df27a2017-02-20 13:51:01 +02005993 if (inode->generation > last_committed) {
5994 ret = btrfs_log_inode(trans, root, inode,
5995 LOG_INODE_EXISTS, 0, LLONG_MAX, ctx);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04005996 if (ret)
5997 goto end_trans;
Chris Mason12fcfd22009-03-24 10:24:20 -04005998 }
Yan, Zheng76dda932009-09-21 16:00:26 -04005999 if (IS_ROOT(parent))
Chris Masone02119d2008-09-05 16:13:11 -04006000 break;
Chris Mason12fcfd22009-03-24 10:24:20 -04006001
Josef Bacik6a912212010-11-20 09:48:00 +00006002 parent = dget_parent(parent);
6003 dput(old_parent);
6004 old_parent = parent;
Chris Masone02119d2008-09-05 16:13:11 -04006005 }
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00006006 if (log_dentries)
Nikolay Borisov19df27a2017-02-20 13:51:01 +02006007 ret = log_new_dir_dentries(trans, root, orig_inode, ctx);
Filipe Manana2f2ff0e2015-03-20 17:19:46 +00006008 else
6009 ret = 0;
Yan, Zheng4a500fd2010-05-16 10:49:59 -04006010end_trans:
Josef Bacik6a912212010-11-20 09:48:00 +00006011 dput(old_parent);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04006012 if (ret < 0) {
David Sterba90787762019-03-20 13:28:05 +01006013 btrfs_set_log_full_commit(trans);
Yan, Zheng4a500fd2010-05-16 10:49:59 -04006014 ret = 1;
6015 }
Miao Xie8b050d32014-02-20 18:08:58 +08006016
6017 if (ret)
6018 btrfs_remove_log_ctx(root, ctx);
Chris Mason12fcfd22009-03-24 10:24:20 -04006019 btrfs_end_log_trans(root);
6020end_no_trans:
6021 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04006022}
6023
6024/*
6025 * it is not safe to log dentry if the chunk root has added new
6026 * chunks. This returns 0 if the dentry was logged, and 1 otherwise.
6027 * If this returns 1, you must commit the transaction to safely get your
6028 * data on disk.
6029 */
6030int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
Nikolay Borisove5b84f7a2018-02-27 17:37:18 +02006031 struct dentry *dentry,
Filipe Manana49dae1b2014-09-06 22:34:39 +01006032 const loff_t start,
6033 const loff_t end,
Miao Xie8b050d32014-02-20 18:08:58 +08006034 struct btrfs_log_ctx *ctx)
Chris Masone02119d2008-09-05 16:13:11 -04006035{
Josef Bacik6a912212010-11-20 09:48:00 +00006036 struct dentry *parent = dget_parent(dentry);
6037 int ret;
6038
Nikolay Borisovf8822742018-02-27 17:37:17 +02006039 ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent,
6040 start, end, LOG_INODE_ALL, ctx);
Josef Bacik6a912212010-11-20 09:48:00 +00006041 dput(parent);
6042
6043 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04006044}
6045
/*
 * Should be called during mount to recover and replay any log trees
 * from the FS.
 */
6050int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
6051{
6052 int ret;
6053 struct btrfs_path *path;
6054 struct btrfs_trans_handle *trans;
6055 struct btrfs_key key;
6056 struct btrfs_key found_key;
6057 struct btrfs_key tmp_key;
6058 struct btrfs_root *log;
6059 struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
6060 struct walk_control wc = {
6061 .process_func = process_one_buffer,
6062 .stage = 0,
6063 };
6064
Chris Masone02119d2008-09-05 16:13:11 -04006065 path = btrfs_alloc_path();
Tsutomu Itohdb5b4932011-03-23 08:14:16 +00006066 if (!path)
6067 return -ENOMEM;
6068
Josef Bacikafcdd122016-09-02 15:40:02 -04006069 set_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
Chris Masone02119d2008-09-05 16:13:11 -04006070
Yan, Zheng4a500fd2010-05-16 10:49:59 -04006071 trans = btrfs_start_transaction(fs_info->tree_root, 0);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006072 if (IS_ERR(trans)) {
6073 ret = PTR_ERR(trans);
6074 goto error;
6075 }
Chris Masone02119d2008-09-05 16:13:11 -04006076
6077 wc.trans = trans;
6078 wc.pin = 1;
6079
Tsutomu Itohdb5b4932011-03-23 08:14:16 +00006080 ret = walk_log_tree(trans, log_root_tree, &wc);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006081 if (ret) {
Jeff Mahoney5d163e02016-09-20 10:05:00 -04006082 btrfs_handle_fs_error(fs_info, ret,
6083 "Failed to pin buffers while recovering log root tree.");
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006084 goto error;
6085 }
Chris Masone02119d2008-09-05 16:13:11 -04006086
6087again:
6088 key.objectid = BTRFS_TREE_LOG_OBJECTID;
6089 key.offset = (u64)-1;
David Sterba962a2982014-06-04 18:41:45 +02006090 key.type = BTRFS_ROOT_ITEM_KEY;
Chris Masone02119d2008-09-05 16:13:11 -04006091
Chris Masond3977122009-01-05 21:25:51 -05006092 while (1) {
Chris Masone02119d2008-09-05 16:13:11 -04006093 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006094
6095 if (ret < 0) {
Anand Jain34d97002016-03-16 16:43:06 +08006096 btrfs_handle_fs_error(fs_info, ret,
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006097 "Couldn't find tree log root.");
6098 goto error;
6099 }
Chris Masone02119d2008-09-05 16:13:11 -04006100 if (ret > 0) {
6101 if (path->slots[0] == 0)
6102 break;
6103 path->slots[0]--;
6104 }
6105 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
6106 path->slots[0]);
David Sterbab3b4aa72011-04-21 01:20:15 +02006107 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04006108 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
6109 break;
6110
Miao Xiecb517ea2013-05-15 07:48:19 +00006111 log = btrfs_read_fs_root(log_root_tree, &found_key);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006112 if (IS_ERR(log)) {
6113 ret = PTR_ERR(log);
Anand Jain34d97002016-03-16 16:43:06 +08006114 btrfs_handle_fs_error(fs_info, ret,
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006115 "Couldn't read tree log root.");
6116 goto error;
6117 }
Chris Masone02119d2008-09-05 16:13:11 -04006118
6119 tmp_key.objectid = found_key.offset;
6120 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
6121 tmp_key.offset = (u64)-1;
6122
6123 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006124 if (IS_ERR(wc.replay_dest)) {
6125 ret = PTR_ERR(wc.replay_dest);
Josef Bacikb50c6e22013-04-25 15:55:30 -04006126 free_extent_buffer(log->node);
6127 free_extent_buffer(log->commit_root);
6128 kfree(log);
Jeff Mahoney5d163e02016-09-20 10:05:00 -04006129 btrfs_handle_fs_error(fs_info, ret,
6130 "Couldn't read target root for tree log recovery.");
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006131 goto error;
6132 }
Chris Masone02119d2008-09-05 16:13:11 -04006133
Yan Zheng07d400a2009-01-06 11:42:00 -05006134 wc.replay_dest->log_root = log;
Yan Zheng5d4f98a2009-06-10 10:45:14 -04006135 btrfs_record_root_in_trans(trans, wc.replay_dest);
Chris Masone02119d2008-09-05 16:13:11 -04006136 ret = walk_log_tree(trans, log, &wc);
Chris Masone02119d2008-09-05 16:13:11 -04006137
Josef Bacikb50c6e22013-04-25 15:55:30 -04006138 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
Chris Masone02119d2008-09-05 16:13:11 -04006139 ret = fixup_inode_link_counts(trans, wc.replay_dest,
6140 path);
Chris Masone02119d2008-09-05 16:13:11 -04006141 }
Chris Masone02119d2008-09-05 16:13:11 -04006142
Liu Bo900c9982018-01-25 11:02:56 -07006143 if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
6144 struct btrfs_root *root = wc.replay_dest;
6145
6146 btrfs_release_path(path);
6147
6148 /*
6149 * We have just replayed everything, and the highest
6150 * objectid of fs roots probably has changed in case
6151 * some inode_item's got replayed.
6152 *
6153 * root->objectid_mutex is not acquired as log replay
6154 * could only happen during mount.
6155 */
6156 ret = btrfs_find_highest_objectid(root,
6157 &root->highest_objectid);
6158 }
6159
Chris Masone02119d2008-09-05 16:13:11 -04006160 key.offset = found_key.offset - 1;
Yan Zheng07d400a2009-01-06 11:42:00 -05006161 wc.replay_dest->log_root = NULL;
Chris Masone02119d2008-09-05 16:13:11 -04006162 free_extent_buffer(log->node);
Chris Masonb263c2c2009-06-11 11:24:47 -04006163 free_extent_buffer(log->commit_root);
Chris Masone02119d2008-09-05 16:13:11 -04006164 kfree(log);
6165
Josef Bacikb50c6e22013-04-25 15:55:30 -04006166 if (ret)
6167 goto error;
6168
Chris Masone02119d2008-09-05 16:13:11 -04006169 if (found_key.offset == 0)
6170 break;
6171 }
David Sterbab3b4aa72011-04-21 01:20:15 +02006172 btrfs_release_path(path);
Chris Masone02119d2008-09-05 16:13:11 -04006173
6174 /* step one is to pin it all, step two is to replay just inodes */
6175 if (wc.pin) {
6176 wc.pin = 0;
6177 wc.process_func = replay_one_buffer;
6178 wc.stage = LOG_WALK_REPLAY_INODES;
6179 goto again;
6180 }
6181 /* step three is to replay everything */
6182 if (wc.stage < LOG_WALK_REPLAY_ALL) {
6183 wc.stage++;
6184 goto again;
6185 }
6186
6187 btrfs_free_path(path);
6188
Josef Bacikabefa552013-04-24 16:40:05 -04006189 /* step 4: commit the transaction, which also unpins the blocks */
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04006190 ret = btrfs_commit_transaction(trans);
Josef Bacikabefa552013-04-24 16:40:05 -04006191 if (ret)
6192 return ret;
6193
Chris Masone02119d2008-09-05 16:13:11 -04006194 free_extent_buffer(log_root_tree->node);
6195 log_root_tree->log_root = NULL;
Josef Bacikafcdd122016-09-02 15:40:02 -04006196 clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
Chris Masone02119d2008-09-05 16:13:11 -04006197 kfree(log_root_tree);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006198
Josef Bacikabefa552013-04-24 16:40:05 -04006199 return 0;
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006200error:
Josef Bacikb50c6e22013-04-25 15:55:30 -04006201 if (wc.trans)
Jeff Mahoney3a45bb22016-09-09 21:39:03 -04006202 btrfs_end_transaction(wc.trans);
Jeff Mahoney79787ea2012-03-12 16:03:00 +01006203 btrfs_free_path(path);
6204 return ret;
Chris Masone02119d2008-09-05 16:13:11 -04006205}
Chris Mason12fcfd22009-03-24 10:24:20 -04006206
6207/*
6208 * there are some corner cases where we want to force a full
6209 * commit instead of allowing a directory to be logged.
6210 *
6211 * They revolve around files there were unlinked from the directory, and
6212 * this function updates the parent directory so that a full commit is
6213 * properly done if it is fsync'd later after the unlinks are done.
Filipe Manana2be63d52016-02-12 11:34:23 +00006214 *
6215 * Must be called before the unlink operations (updates to the subvolume tree,
6216 * inodes, etc) are done.
Chris Mason12fcfd22009-03-24 10:24:20 -04006217 */
6218void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
Nikolay Borisov4176bdb2017-01-18 00:31:28 +02006219 struct btrfs_inode *dir, struct btrfs_inode *inode,
Chris Mason12fcfd22009-03-24 10:24:20 -04006220 int for_rename)
6221{
6222 /*
Chris Masonaf4176b2009-03-24 10:24:31 -04006223 * when we're logging a file, if it hasn't been renamed
6224 * or unlinked, and its inode is fully committed on disk,
6225 * we don't have to worry about walking up the directory chain
6226 * to log its parents.
6227 *
6228 * So, we use the last_unlink_trans field to put this transid
6229 * into the file. When the file is logged we check it and
6230 * don't log the parents if the file is fully on disk.
6231 */
Nikolay Borisov4176bdb2017-01-18 00:31:28 +02006232 mutex_lock(&inode->log_mutex);
6233 inode->last_unlink_trans = trans->transid;
6234 mutex_unlock(&inode->log_mutex);
Chris Masonaf4176b2009-03-24 10:24:31 -04006235
6236 /*
Chris Mason12fcfd22009-03-24 10:24:20 -04006237 * if this directory was already logged any new
6238 * names for this file/dir will get recorded
6239 */
6240 smp_mb();
Nikolay Borisov4176bdb2017-01-18 00:31:28 +02006241 if (dir->logged_trans == trans->transid)
Chris Mason12fcfd22009-03-24 10:24:20 -04006242 return;
6243
6244 /*
6245 * if the inode we're about to unlink was logged,
6246 * the log will be properly updated for any new names
6247 */
Nikolay Borisov4176bdb2017-01-18 00:31:28 +02006248 if (inode->logged_trans == trans->transid)
Chris Mason12fcfd22009-03-24 10:24:20 -04006249 return;
6250
6251 /*
6252 * when renaming files across directories, if the directory
6253 * there we're unlinking from gets fsync'd later on, there's
6254 * no way to find the destination directory later and fsync it
6255 * properly. So, we have to be conservative and force commits
6256 * so the new name gets discovered.
6257 */
6258 if (for_rename)
6259 goto record;
6260
6261 /* we can safely do the unlink without any special recording */
6262 return;
6263
6264record:
Nikolay Borisov4176bdb2017-01-18 00:31:28 +02006265 mutex_lock(&dir->log_mutex);
6266 dir->last_unlink_trans = trans->transid;
6267 mutex_unlock(&dir->log_mutex);
Chris Mason12fcfd22009-03-24 10:24:20 -04006268}
6269
6270/*
Filipe Manana1ec9a1a2016-02-10 10:42:25 +00006271 * Make sure that if someone attempts to fsync the parent directory of a deleted
6272 * snapshot, it ends up triggering a transaction commit. This is to guarantee
6273 * that after replaying the log tree of the parent directory's root we will not
6274 * see the snapshot anymore and at log replay time we will not see any log tree
6275 * corresponding to the deleted snapshot's root, which could lead to replaying
6276 * it after replaying the log tree of the parent directory (which would replay
6277 * the snapshot delete operation).
Filipe Manana2be63d52016-02-12 11:34:23 +00006278 *
6279 * Must be called before the actual snapshot destroy operation (updates to the
6280 * parent root and tree of tree roots trees, etc) are done.
Filipe Manana1ec9a1a2016-02-10 10:42:25 +00006281 */
6282void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
Nikolay Borisov43663552017-01-18 00:31:29 +02006283 struct btrfs_inode *dir)
Filipe Manana1ec9a1a2016-02-10 10:42:25 +00006284{
Nikolay Borisov43663552017-01-18 00:31:29 +02006285 mutex_lock(&dir->log_mutex);
6286 dir->last_unlink_trans = trans->transid;
6287 mutex_unlock(&dir->log_mutex);
Filipe Manana1ec9a1a2016-02-10 10:42:25 +00006288}
6289
6290/*
Chris Mason12fcfd22009-03-24 10:24:20 -04006291 * Call this after adding a new name for a file and it will properly
6292 * update the log to reflect the new name.
6293 *
Filipe Mananad4682ba2018-06-11 19:24:28 +01006294 * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
6295 * true (because it's not used).
6296 *
6297 * Return value depends on whether @sync_log is true or false.
6298 * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6299 * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
6300 * otherwise.
6301 * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
6302 * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
6303 * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
6304 * committed (without attempting to sync the log).
Chris Mason12fcfd22009-03-24 10:24:20 -04006305 */
6306int btrfs_log_new_name(struct btrfs_trans_handle *trans,
Nikolay Borisov9ca5fbfb2017-01-18 00:31:31 +02006307 struct btrfs_inode *inode, struct btrfs_inode *old_dir,
Filipe Mananad4682ba2018-06-11 19:24:28 +01006308 struct dentry *parent,
6309 bool sync_log, struct btrfs_log_ctx *ctx)
Chris Mason12fcfd22009-03-24 10:24:20 -04006310{
David Sterba3ffbd682018-06-29 10:56:42 +02006311 struct btrfs_fs_info *fs_info = trans->fs_info;
Filipe Mananad4682ba2018-06-11 19:24:28 +01006312 int ret;
Chris Mason12fcfd22009-03-24 10:24:20 -04006313
6314 /*
Chris Masonaf4176b2009-03-24 10:24:31 -04006315 * this will force the logging code to walk the dentry chain
6316 * up for the file
6317 */
Filipe Manana9a6509c2018-02-28 15:55:40 +00006318 if (!S_ISDIR(inode->vfs_inode.i_mode))
Nikolay Borisov9ca5fbfb2017-01-18 00:31:31 +02006319 inode->last_unlink_trans = trans->transid;
Chris Masonaf4176b2009-03-24 10:24:31 -04006320
6321 /*
Chris Mason12fcfd22009-03-24 10:24:20 -04006322 * if this inode hasn't been logged and directory we're renaming it
6323 * from hasn't been logged, we don't need to log it
6324 */
Nikolay Borisov9ca5fbfb2017-01-18 00:31:31 +02006325 if (inode->logged_trans <= fs_info->last_trans_committed &&
6326 (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
Filipe Mananad4682ba2018-06-11 19:24:28 +01006327 return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
6328 BTRFS_DONT_NEED_LOG_SYNC;
Chris Mason12fcfd22009-03-24 10:24:20 -04006329
Filipe Mananad4682ba2018-06-11 19:24:28 +01006330 if (sync_log) {
6331 struct btrfs_log_ctx ctx2;
6332
6333 btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
6334 ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6335 LOG_INODE_EXISTS, &ctx2);
6336 if (ret == BTRFS_NO_LOG_SYNC)
6337 return BTRFS_DONT_NEED_TRANS_COMMIT;
6338 else if (ret)
6339 return BTRFS_NEED_TRANS_COMMIT;
6340
6341 ret = btrfs_sync_log(trans, inode->root, &ctx2);
6342 if (ret)
6343 return BTRFS_NEED_TRANS_COMMIT;
6344 return BTRFS_DONT_NEED_TRANS_COMMIT;
6345 }
6346
6347 ASSERT(ctx);
6348 ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
6349 LOG_INODE_EXISTS, ctx);
6350 if (ret == BTRFS_NO_LOG_SYNC)
6351 return BTRFS_DONT_NEED_LOG_SYNC;
6352 else if (ret)
6353 return BTRFS_NEED_TRANS_COMMIT;
6354
6355 return BTRFS_NEED_LOG_SYNC;
Chris Mason12fcfd22009-03-24 10:24:20 -04006356}
6357