blob: 57bd2ff9788828fb384b10fb107f7b05cb871fa7 [file] [log] [blame]
Dave Kleikamp470decc2006-10-11 01:20:57 -07001/*
Mingming Caof7f4bcc2006-10-11 01:20:59 -07002 * linux/fs/jbd2/commit.c
Dave Kleikamp470decc2006-10-11 01:20:57 -07003 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
Mingming Caof7f4bcc2006-10-11 01:20:59 -070018#include <linux/jbd2.h>
Dave Kleikamp470decc2006-10-11 01:20:57 -070019#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
Johann Lombardi8e85fb32008-01-28 23:58:27 -050023#include <linux/jiffies.h>
Girish Shilamkar818d2762008-01-28 23:58:27 -050024#include <linux/crc32.h>
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -040025#include <linux/writeback.h>
26#include <linux/backing-dev.h>
Theodore Ts'ofd984962009-01-05 21:34:13 -050027#include <linux/bio.h>
Theodore Ts'o0e3d2a62009-09-11 09:30:12 -040028#include <linux/blkdev.h>
Brian King39e3ac22010-10-27 21:25:12 -040029#include <linux/bitops.h>
Theodore Ts'o879c5e62009-06-17 11:47:48 -040030#include <trace/events/jbd2.h>
Dave Kleikamp470decc2006-10-11 01:20:57 -070031
/*
 * Default I/O completion handler for temporary BJ_IO buffer_heads.
 *
 * Records the outcome of the transfer in the buffer's uptodate bit
 * (set when @uptodate is non-zero, cleared otherwise) and then unlocks
 * the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
44
45/*
Jan Kara87c89c22008-07-11 19:27:31 -040046 * When an ext4 file is truncated, it is possible that some pages are not
47 * successfully freed, because they are attached to a committing transaction.
Dave Kleikamp470decc2006-10-11 01:20:57 -070048 * After the transaction commits, these pages are left on the LRU, with no
49 * ->mapping, and with attached buffers. These pages are trivially reclaimable
50 * by the VM, but their apparent absence upsets the VM accounting, and it makes
51 * the numbers in /proc/meminfo look odd.
52 *
53 * So here, we have a buffer which has just come off the forget list. Look to
54 * see if we can strip all buffers from the backing page.
55 *
56 * Called under lock_journal(), and possibly under journal_datalist_lock. The
57 * caller provided us with a ref against the buffer, and we drop that here.
58 */
59static void release_buffer_page(struct buffer_head *bh)
60{
61 struct page *page;
62
63 if (buffer_dirty(bh))
64 goto nope;
65 if (atomic_read(&bh->b_count) != 1)
66 goto nope;
67 page = bh->b_page;
68 if (!page)
69 goto nope;
70 if (page->mapping)
71 goto nope;
72
73 /* OK, it's a truncated page */
Nick Piggin529ae9a2008-08-02 12:01:03 +020074 if (!trylock_page(page))
Dave Kleikamp470decc2006-10-11 01:20:57 -070075 goto nope;
76
77 page_cache_get(page);
78 __brelse(bh);
79 try_to_free_buffers(page);
80 unlock_page(page);
81 page_cache_release(page);
82 return;
83
84nope:
85 __brelse(bh);
86}
87
Darrick J. Wong1f56c582012-05-27 08:10:25 -040088static void jbd2_commit_block_csum_set(journal_t *j,
89 struct journal_head *descriptor)
90{
91 struct commit_header *h;
92 __u32 csum;
93
94 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95 return;
96
97 h = (struct commit_header *)(jh2bh(descriptor)->b_data);
98 h->h_chksum_type = 0;
99 h->h_chksum_size = 0;
100 h->h_chksum[0] = 0;
101 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
102 j->j_blocksize);
103 h->h_chksum[0] = cpu_to_be32(csum);
104}
105
Dave Kleikamp470decc2006-10-11 01:20:57 -0700106/*
Girish Shilamkar818d2762008-01-28 23:58:27 -0500107 * Done it all: now submit the commit record. We should have
Dave Kleikamp470decc2006-10-11 01:20:57 -0700108 * cleaned up our previous buffers by now, so if we are in abort
109 * mode we can now just skip the rest of the journal write
110 * entirely.
111 *
112 * Returns 1 if the journal needs to be aborted or 0 on success
113 */
Girish Shilamkar818d2762008-01-28 23:58:27 -0500114static int journal_submit_commit_record(journal_t *journal,
115 transaction_t *commit_transaction,
116 struct buffer_head **cbh,
117 __u32 crc32_sum)
Dave Kleikamp470decc2006-10-11 01:20:57 -0700118{
119 struct journal_head *descriptor;
Girish Shilamkar818d2762008-01-28 23:58:27 -0500120 struct commit_header *tmp;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700121 struct buffer_head *bh;
Girish Shilamkar818d2762008-01-28 23:58:27 -0500122 int ret;
Theodore Ts'o736603a2008-07-11 19:27:31 -0400123 struct timespec now = current_kernel_time();
Dave Kleikamp470decc2006-10-11 01:20:57 -0700124
Zhang Huan6cba6112011-04-05 19:16:20 -0400125 *cbh = NULL;
126
Dave Kleikamp470decc2006-10-11 01:20:57 -0700127 if (is_journal_aborted(journal))
128 return 0;
129
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700130 descriptor = jbd2_journal_get_descriptor_buffer(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700131 if (!descriptor)
132 return 1;
133
134 bh = jh2bh(descriptor);
135
Girish Shilamkar818d2762008-01-28 23:58:27 -0500136 tmp = (struct commit_header *)bh->b_data;
137 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
139 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
Theodore Ts'o736603a2008-07-11 19:27:31 -0400140 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
141 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500142
143 if (JBD2_HAS_COMPAT_FEATURE(journal,
144 JBD2_FEATURE_COMPAT_CHECKSUM)) {
145 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
146 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
147 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700148 }
Darrick J. Wong1f56c582012-05-27 08:10:25 -0400149 jbd2_commit_block_csum_set(journal, descriptor);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700150
Girish Shilamkar818d2762008-01-28 23:58:27 -0500151 JBUFFER_TRACE(descriptor, "submit commit block");
152 lock_buffer(bh);
Theodore Ts'o45a90bf2008-10-06 12:04:02 -0400153 clear_buffer_dirty(bh);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500154 set_buffer_uptodate(bh);
155 bh->b_end_io = journal_end_buffer_io_sync;
156
157 if (journal->j_flags & JBD2_BARRIER &&
Theodore Ts'o0e3d2a62009-09-11 09:30:12 -0400158 !JBD2_HAS_INCOMPAT_FEATURE(journal,
Christoph Hellwig9c355752010-08-18 05:29:17 -0400159 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
Jens Axboe721a9602011-03-09 11:56:30 +0100160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
Christoph Hellwig9c355752010-08-18 05:29:17 -0400161 else
Jens Axboe721a9602011-03-09 11:56:30 +0100162 ret = submit_bh(WRITE_SYNC, bh);
Christoph Hellwig9c355752010-08-18 05:29:17 -0400163
Girish Shilamkar818d2762008-01-28 23:58:27 -0500164 *cbh = bh;
165 return ret;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700166}
167
Girish Shilamkar818d2762008-01-28 23:58:27 -0500168/*
169 * This function along with journal_submit_commit_record
170 * allows to write the commit record asynchronously.
171 */
Theodore Ts'ofd984962009-01-05 21:34:13 -0500172static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
Girish Shilamkar818d2762008-01-28 23:58:27 -0500174{
175 int ret = 0;
176
177 clear_buffer_dirty(bh);
178 wait_on_buffer(bh);
179
180 if (unlikely(!buffer_uptodate(bh)))
181 ret = -EIO;
182 put_bh(bh); /* One for getblk() */
183 jbd2_journal_put_journal_head(bh2jh(bh));
184
185 return ret;
186}
187
188/*
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400189 * write the filemap data using writepage() address_space_operations.
190 * We don't do block allocation here even for delalloc. We don't
191 * use writepages() because with dealyed allocation we may be doing
192 * block allocation in writepages().
193 */
194static int journal_submit_inode_data_buffers(struct address_space *mapping)
195{
196 int ret;
197 struct writeback_control wbc = {
198 .sync_mode = WB_SYNC_ALL,
199 .nr_to_write = mapping->nrpages * 2,
200 .range_start = 0,
201 .range_end = i_size_read(mapping->host),
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400202 };
203
204 ret = generic_writepages(mapping, &wbc);
205 return ret;
206}
207
208/*
Jan Karac851ed52008-07-11 19:27:31 -0400209 * Submit all the data buffers of inode associated with the transaction to
210 * disk.
211 *
212 * We are in a committing transaction. Therefore no new inode can be added to
213 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
214 * operate on from being released while we write out pages.
215 */
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400216static int journal_submit_data_buffers(journal_t *journal,
Jan Karac851ed52008-07-11 19:27:31 -0400217 transaction_t *commit_transaction)
218{
219 struct jbd2_inode *jinode;
220 int err, ret = 0;
221 struct address_space *mapping;
222
223 spin_lock(&journal->j_list_lock);
224 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
225 mapping = jinode->i_vfs_inode->i_mapping;
Brian King39e3ac22010-10-27 21:25:12 -0400226 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
Jan Karac851ed52008-07-11 19:27:31 -0400227 spin_unlock(&journal->j_list_lock);
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400228 /*
229 * submit the inode data buffers. We use writepage
230 * instead of writepages. Because writepages can do
231 * block allocation with delalloc. We need to write
232 * only allocated blocks here.
233 */
Theodore Ts'o879c5e62009-06-17 11:47:48 -0400234 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400235 err = journal_submit_inode_data_buffers(mapping);
Jan Karac851ed52008-07-11 19:27:31 -0400236 if (!ret)
237 ret = err;
238 spin_lock(&journal->j_list_lock);
239 J_ASSERT(jinode->i_transaction == commit_transaction);
Brian King39e3ac22010-10-27 21:25:12 -0400240 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
241 smp_mb__after_clear_bit();
Jan Karac851ed52008-07-11 19:27:31 -0400242 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243 }
244 spin_unlock(&journal->j_list_lock);
245 return ret;
246}
247
248/*
249 * Wait for data submitted for writeout, refile inodes to proper
250 * transaction if needed.
251 *
252 */
253static int journal_finish_inode_data_buffers(journal_t *journal,
254 transaction_t *commit_transaction)
255{
256 struct jbd2_inode *jinode, *next_i;
257 int err, ret = 0;
258
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400259 /* For locking, see the comment in journal_submit_data_buffers() */
Jan Karac851ed52008-07-11 19:27:31 -0400260 spin_lock(&journal->j_list_lock);
261 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
Brian King39e3ac22010-10-27 21:25:12 -0400262 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
Jan Karac851ed52008-07-11 19:27:31 -0400263 spin_unlock(&journal->j_list_lock);
264 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
Hidehiro Kawaie9e34f42008-07-31 22:26:04 -0400265 if (err) {
266 /*
267 * Because AS_EIO is cleared by
Christoph Hellwig94004ed2009-09-30 22:16:33 +0200268 * filemap_fdatawait_range(), set it again so
Hidehiro Kawaie9e34f42008-07-31 22:26:04 -0400269 * that user process can get -EIO from fsync().
270 */
271 set_bit(AS_EIO,
272 &jinode->i_vfs_inode->i_mapping->flags);
273
274 if (!ret)
275 ret = err;
276 }
Jan Karac851ed52008-07-11 19:27:31 -0400277 spin_lock(&journal->j_list_lock);
Brian King39e3ac22010-10-27 21:25:12 -0400278 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
279 smp_mb__after_clear_bit();
Jan Karac851ed52008-07-11 19:27:31 -0400280 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
281 }
282
283 /* Now refile inode to proper lists */
284 list_for_each_entry_safe(jinode, next_i,
285 &commit_transaction->t_inode_list, i_list) {
286 list_del(&jinode->i_list);
287 if (jinode->i_next_transaction) {
288 jinode->i_transaction = jinode->i_next_transaction;
289 jinode->i_next_transaction = NULL;
290 list_add(&jinode->i_list,
291 &jinode->i_transaction->t_inode_list);
292 } else {
293 jinode->i_transaction = NULL;
294 }
295 }
296 spin_unlock(&journal->j_list_lock);
297
298 return ret;
299}
300
Girish Shilamkar818d2762008-01-28 23:58:27 -0500301static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302{
303 struct page *page = bh->b_page;
304 char *addr;
305 __u32 checksum;
306
Cong Wang303a8f22011-11-25 23:14:31 +0800307 addr = kmap_atomic(page);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500308 checksum = crc32_be(crc32_sum,
309 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
Cong Wang303a8f22011-11-25 23:14:31 +0800310 kunmap_atomic(addr);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500311
312 return checksum;
313}
314
315static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
Mingming Cao18eba7a2006-10-11 01:21:13 -0700316 unsigned long long block)
Zach Brownb517bea2006-10-11 01:21:08 -0700317{
318 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
Mingming Caocd02ff02007-10-16 18:38:25 -0400319 if (tag_bytes > JBD2_TAG_SIZE32)
Zach Brownb517bea2006-10-11 01:21:08 -0700320 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
321}
322
Darrick J. Wong3caa4872012-05-27 08:10:22 -0400323static void jbd2_descr_block_csum_set(journal_t *j,
324 struct journal_head *descriptor)
325{
326 struct jbd2_journal_block_tail *tail;
327 __u32 csum;
328
329 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330 return;
331
332 tail = (struct jbd2_journal_block_tail *)
333 (jh2bh(descriptor)->b_data + j->j_blocksize -
334 sizeof(struct jbd2_journal_block_tail));
335 tail->t_checksum = 0;
336 csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
337 j->j_blocksize);
338 tail->t_checksum = cpu_to_be32(csum);
339}
340
Darrick J. Wongc3900872012-05-27 08:12:12 -0400341static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
342 struct buffer_head *bh, __u32 sequence)
343{
344 struct page *page = bh->b_page;
345 __u8 *addr;
Darrick J. Wongeee06c52013-05-28 07:31:59 -0400346 __u32 csum32;
Darrick J. Wongc3900872012-05-27 08:12:12 -0400347
348 if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
349 return;
350
351 sequence = cpu_to_be32(sequence);
Cong Wang906adea2012-06-23 11:24:48 +0800352 addr = kmap_atomic(page);
Darrick J. Wongeee06c52013-05-28 07:31:59 -0400353 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
354 sizeof(sequence));
355 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
356 bh->b_size);
Cong Wang906adea2012-06-23 11:24:48 +0800357 kunmap_atomic(addr);
Darrick J. Wongc3900872012-05-27 08:12:12 -0400358
Darrick J. Wongeee06c52013-05-28 07:31:59 -0400359 /* We only have space to store the lower 16 bits of the crc32c. */
360 tag->t_checksum = cpu_to_be16(csum32);
Darrick J. Wongc3900872012-05-27 08:12:12 -0400361}
Dave Kleikamp470decc2006-10-11 01:20:57 -0700362/*
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700363 * jbd2_journal_commit_transaction
Dave Kleikamp470decc2006-10-11 01:20:57 -0700364 *
365 * The primary function for committing a transaction to the log. This
366 * function is called by the journal thread to begin a complete commit.
367 */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700368void jbd2_journal_commit_transaction(journal_t *journal)
Dave Kleikamp470decc2006-10-11 01:20:57 -0700369{
Johann Lombardi8e85fb32008-01-28 23:58:27 -0500370 struct transaction_stats_s stats;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700371 transaction_t *commit_transaction;
Jan Karaf5113ef2013-06-04 12:01:45 -0400372 struct journal_head *jh, *descriptor;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700373 struct buffer_head **wbuf = journal->j_wbuf;
374 int bufs;
375 int flags;
376 int err;
Mingming Cao18eba7a2006-10-11 01:21:13 -0700377 unsigned long long blocknr;
Josef Bacike07f7182008-11-26 01:14:26 -0500378 ktime_t start_time;
379 u64 commit_time;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700380 char *tagp = NULL;
381 journal_header_t *header;
382 journal_block_tag_t *tag = NULL;
383 int space_left = 0;
384 int first_tag = 0;
385 int tag_flag;
Dmitry Monakhov794446c2013-04-03 22:06:52 -0400386 int i;
Zach Brownb517bea2006-10-11 01:21:08 -0700387 int tag_bytes = journal_tag_bytes(journal);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500388 struct buffer_head *cbh = NULL; /* For transactional checksums */
389 __u32 crc32_sum = ~0;
Jens Axboe82f04ab2011-03-17 11:01:52 +0100390 struct blk_plug plug;
Jan Kara33395782012-03-13 22:45:38 -0400391 /* Tail of the journal */
392 unsigned long first_block;
393 tid_t first_tid;
394 int update_tail;
Darrick J. Wong3caa4872012-05-27 08:10:22 -0400395 int csum_size = 0;
Jan Karaf5113ef2013-06-04 12:01:45 -0400396 LIST_HEAD(io_bufs);
Darrick J. Wong3caa4872012-05-27 08:10:22 -0400397
398 if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
399 csum_size = sizeof(struct jbd2_journal_block_tail);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700400
401 /*
402 * First job: lock down the current transaction and wait for
403 * all outstanding updates to complete.
404 */
405
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700406 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
407 if (journal->j_flags & JBD2_FLUSHED) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700408 jbd_debug(3, "super block updated\n");
Jan Karaa78bb112012-03-13 15:43:04 -0400409 mutex_lock(&journal->j_checkpoint_mutex);
Jan Kara79feb522012-03-13 22:22:54 -0400410 /*
411 * We hold j_checkpoint_mutex so tail cannot change under us.
412 * We don't need any special data guarantees for writing sb
413 * since journal is empty and it is ok for write to be
414 * flushed only with transaction commit.
415 */
416 jbd2_journal_update_sb_log_tail(journal,
417 journal->j_tail_sequence,
418 journal->j_tail,
419 WRITE_SYNC);
Jan Karaa78bb112012-03-13 15:43:04 -0400420 mutex_unlock(&journal->j_checkpoint_mutex);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700421 } else {
422 jbd_debug(3, "superblock not updated\n");
423 }
424
425 J_ASSERT(journal->j_running_transaction != NULL);
426 J_ASSERT(journal->j_committing_transaction == NULL);
427
428 commit_transaction = journal->j_running_transaction;
429 J_ASSERT(commit_transaction->t_state == T_RUNNING);
430
Theodore Ts'o879c5e62009-06-17 11:47:48 -0400431 trace_jbd2_start_commit(journal, commit_transaction);
Eryu Guanf2a44522011-11-01 19:09:18 -0400432 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
Dave Kleikamp470decc2006-10-11 01:20:57 -0700433 commit_transaction->t_tid);
434
Theodore Ts'oa931da62010-08-03 21:35:12 -0400435 write_lock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700436 commit_transaction->t_state = T_LOCKED;
437
Theodore Ts'o879c5e62009-06-17 11:47:48 -0400438 trace_jbd2_commit_locking(journal, commit_transaction);
Theodore Ts'obf699322009-09-30 00:32:06 -0400439 stats.run.rs_wait = commit_transaction->t_max_wait;
Theodore Ts'o9fff24a2013-02-06 22:30:23 -0500440 stats.run.rs_request_delay = 0;
Theodore Ts'obf699322009-09-30 00:32:06 -0400441 stats.run.rs_locked = jiffies;
Theodore Ts'o9fff24a2013-02-06 22:30:23 -0500442 if (commit_transaction->t_requested)
443 stats.run.rs_request_delay =
444 jbd2_time_diff(commit_transaction->t_requested,
445 stats.run.rs_locked);
Theodore Ts'obf699322009-09-30 00:32:06 -0400446 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
447 stats.run.rs_locked);
Johann Lombardi8e85fb32008-01-28 23:58:27 -0500448
Dave Kleikamp470decc2006-10-11 01:20:57 -0700449 spin_lock(&commit_transaction->t_handle_lock);
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400450 while (atomic_read(&commit_transaction->t_updates)) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700451 DEFINE_WAIT(wait);
452
453 prepare_to_wait(&journal->j_wait_updates, &wait,
454 TASK_UNINTERRUPTIBLE);
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400455 if (atomic_read(&commit_transaction->t_updates)) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700456 spin_unlock(&commit_transaction->t_handle_lock);
Theodore Ts'oa931da62010-08-03 21:35:12 -0400457 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700458 schedule();
Theodore Ts'oa931da62010-08-03 21:35:12 -0400459 write_lock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700460 spin_lock(&commit_transaction->t_handle_lock);
461 }
462 finish_wait(&journal->j_wait_updates, &wait);
463 }
464 spin_unlock(&commit_transaction->t_handle_lock);
465
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400466 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
Dave Kleikamp470decc2006-10-11 01:20:57 -0700467 journal->j_max_transaction_buffers);
468
469 /*
470 * First thing we are allowed to do is to discard any remaining
471 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
472 * that there are no such buffers: if a large filesystem
473 * operation like a truncate needs to split itself over multiple
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700474 * transactions, then it may try to do a jbd2_journal_restart() while
Dave Kleikamp470decc2006-10-11 01:20:57 -0700475 * there are still BJ_Reserved buffers outstanding. These must
476 * be released cleanly from the current transaction.
477 *
478 * In this case, the filesystem must still reserve write access
479 * again before modifying the buffer in the new transaction, but
480 * we do not require it to remember exactly which old buffers it
481 * has reserved. This is consistent with the existing behaviour
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700482 * that multiple jbd2_journal_get_write_access() calls to the same
Lucas De Marchi25985ed2011-03-30 22:57:33 -0300483 * buffer are perfectly permissible.
Dave Kleikamp470decc2006-10-11 01:20:57 -0700484 */
485 while (commit_transaction->t_reserved_list) {
486 jh = commit_transaction->t_reserved_list;
487 JBUFFER_TRACE(jh, "reserved, unused: refile");
488 /*
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700489 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
Dave Kleikamp470decc2006-10-11 01:20:57 -0700490 * leave undo-committed data.
491 */
492 if (jh->b_committed_data) {
493 struct buffer_head *bh = jh2bh(jh);
494
495 jbd_lock_bh_state(bh);
Mingming Caoaf1e76d2007-10-16 18:38:25 -0400496 jbd2_free(jh->b_committed_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700497 jh->b_committed_data = NULL;
498 jbd_unlock_bh_state(bh);
499 }
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700500 jbd2_journal_refile_buffer(journal, jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700501 }
502
503 /*
504 * Now try to drop any written-back buffers from the journal's
505 * checkpoint lists. We do this *before* commit because it potentially
506 * frees some memory
507 */
508 spin_lock(&journal->j_list_lock);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700509 __jbd2_journal_clean_checkpoint_list(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700510 spin_unlock(&journal->j_list_lock);
511
Eryu Guanf2a44522011-11-01 19:09:18 -0400512 jbd_debug(3, "JBD2: commit phase 1\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700513
514 /*
Yongqiang Yang1ba37262011-12-28 17:46:46 -0500515 * Clear revoked flag to reflect there is no revoked buffers
516 * in the next transaction which is going to be started.
517 */
518 jbd2_clear_buffer_revoked_flags(journal);
519
520 /*
Dave Kleikamp470decc2006-10-11 01:20:57 -0700521 * Switch to a new revoke table.
522 */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700523 jbd2_journal_switch_revoke_table(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700524
Theodore Ts'o879c5e62009-06-17 11:47:48 -0400525 trace_jbd2_commit_flushing(journal, commit_transaction);
Theodore Ts'obf699322009-09-30 00:32:06 -0400526 stats.run.rs_flushing = jiffies;
527 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
528 stats.run.rs_flushing);
Johann Lombardi8e85fb32008-01-28 23:58:27 -0500529
Dave Kleikamp470decc2006-10-11 01:20:57 -0700530 commit_transaction->t_state = T_FLUSH;
531 journal->j_committing_transaction = commit_transaction;
532 journal->j_running_transaction = NULL;
Josef Bacike07f7182008-11-26 01:14:26 -0500533 start_time = ktime_get();
Dave Kleikamp470decc2006-10-11 01:20:57 -0700534 commit_transaction->t_log_start = journal->j_head;
535 wake_up(&journal->j_wait_transaction_locked);
Theodore Ts'oa931da62010-08-03 21:35:12 -0400536 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700537
Eryu Guanf2a44522011-11-01 19:09:18 -0400538 jbd_debug(3, "JBD2: commit phase 2\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700539
540 /*
Dave Kleikamp470decc2006-10-11 01:20:57 -0700541 * Now start flushing things to disk, in the order they appear
542 * on the transaction lists. Data blocks go first.
543 */
Aneesh Kumar K.Vcd1aac32008-07-11 19:27:31 -0400544 err = journal_submit_data_buffers(journal, commit_transaction);
Jan Karac851ed52008-07-11 19:27:31 -0400545 if (err)
546 jbd2_journal_abort(journal, err);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700547
Jens Axboe82f04ab2011-03-17 11:01:52 +0100548 blk_start_plug(&plug);
Theodore Ts'o67c457a2009-04-14 07:50:56 -0400549 jbd2_journal_write_revoke_records(journal, commit_transaction,
Jens Axboe82f04ab2011-03-17 11:01:52 +0100550 WRITE_SYNC);
551 blk_finish_plug(&plug);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700552
Eryu Guanf2a44522011-11-01 19:09:18 -0400553 jbd_debug(3, "JBD2: commit phase 2\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700554
555 /*
Dave Kleikamp470decc2006-10-11 01:20:57 -0700556 * Way to go: we have now written out all of the data for a
557 * transaction! Now comes the tricky part: we need to write out
558 * metadata. Loop over the transaction's entire buffer list:
559 */
Theodore Ts'oa931da62010-08-03 21:35:12 -0400560 write_lock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700561 commit_transaction->t_state = T_COMMIT;
Theodore Ts'oa931da62010-08-03 21:35:12 -0400562 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700563
Theodore Ts'o879c5e62009-06-17 11:47:48 -0400564 trace_jbd2_commit_logging(journal, commit_transaction);
Theodore Ts'obf699322009-09-30 00:32:06 -0400565 stats.run.rs_logging = jiffies;
566 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
567 stats.run.rs_logging);
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400568 stats.run.rs_blocks =
569 atomic_read(&commit_transaction->t_outstanding_credits);
Theodore Ts'obf699322009-09-30 00:32:06 -0400570 stats.run.rs_blocks_logged = 0;
Johann Lombardi8e85fb32008-01-28 23:58:27 -0500571
Josef Bacik1dfc3222008-04-17 10:38:59 -0400572 J_ASSERT(commit_transaction->t_nr_buffers <=
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400573 atomic_read(&commit_transaction->t_outstanding_credits));
Josef Bacik1dfc3222008-04-17 10:38:59 -0400574
Jan Kara87c89c22008-07-11 19:27:31 -0400575 err = 0;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700576 descriptor = NULL;
577 bufs = 0;
Jens Axboe82f04ab2011-03-17 11:01:52 +0100578 blk_start_plug(&plug);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700579 while (commit_transaction->t_buffers) {
580
581 /* Find the next buffer to be journaled... */
582
583 jh = commit_transaction->t_buffers;
584
585 /* If we're in abort mode, we just un-journal the buffer and
Hidehiro Kawai7ad74452008-10-10 20:29:31 -0400586 release it. */
Dave Kleikamp470decc2006-10-11 01:20:57 -0700587
588 if (is_journal_aborted(journal)) {
Hidehiro Kawai7ad74452008-10-10 20:29:31 -0400589 clear_buffer_jbddirty(jh2bh(jh));
Dave Kleikamp470decc2006-10-11 01:20:57 -0700590 JBUFFER_TRACE(jh, "journal is aborting: refile");
Joel Beckere06c8222008-09-11 15:35:47 -0700591 jbd2_buffer_abort_trigger(jh,
592 jh->b_frozen_data ?
593 jh->b_frozen_triggers :
594 jh->b_triggers);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700595 jbd2_journal_refile_buffer(journal, jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700596 /* If that was the last one, we need to clean up
597 * any descriptor buffers which may have been
598 * already allocated, even if we are now
599 * aborting. */
600 if (!commit_transaction->t_buffers)
601 goto start_journal_io;
602 continue;
603 }
604
605 /* Make sure we have a descriptor block in which to
606 record the metadata buffer. */
607
608 if (!descriptor) {
609 struct buffer_head *bh;
610
611 J_ASSERT (bufs == 0);
612
Eryu Guanf2a44522011-11-01 19:09:18 -0400613 jbd_debug(4, "JBD2: get descriptor\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700614
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700615 descriptor = jbd2_journal_get_descriptor_buffer(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700616 if (!descriptor) {
Jan Karaa7fa2ba2007-10-16 18:38:25 -0400617 jbd2_journal_abort(journal, -EIO);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700618 continue;
619 }
620
621 bh = jh2bh(descriptor);
Eryu Guanf2a44522011-11-01 19:09:18 -0400622 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
Dave Kleikamp470decc2006-10-11 01:20:57 -0700623 (unsigned long long)bh->b_blocknr, bh->b_data);
624 header = (journal_header_t *)&bh->b_data[0];
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700625 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
626 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700627 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
628
629 tagp = &bh->b_data[sizeof(journal_header_t)];
630 space_left = bh->b_size - sizeof(journal_header_t);
631 first_tag = 1;
632 set_buffer_jwrite(bh);
633 set_buffer_dirty(bh);
634 wbuf[bufs++] = bh;
635
636 /* Record it so that we can wait for IO
637 completion later */
638 BUFFER_TRACE(bh, "ph3: file as descriptor");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700639 jbd2_journal_file_buffer(descriptor, commit_transaction,
Dave Kleikamp470decc2006-10-11 01:20:57 -0700640 BJ_LogCtl);
641 }
642
643 /* Where is the buffer to be written? */
644
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700645 err = jbd2_journal_next_log_block(journal, &blocknr);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700646 /* If the block mapping failed, just abandon the buffer
647 and repeat this loop: we'll fall into the
648 refile-on-abort condition above. */
649 if (err) {
Jan Karaa7fa2ba2007-10-16 18:38:25 -0400650 jbd2_journal_abort(journal, err);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700651 continue;
652 }
653
654 /*
655 * start_this_handle() uses t_outstanding_credits to determine
656 * the free space in the log, but this counter is changed
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700657 * by jbd2_journal_next_log_block() also.
Dave Kleikamp470decc2006-10-11 01:20:57 -0700658 */
Theodore Ts'oa51dca92010-08-02 08:43:25 -0400659 atomic_dec(&commit_transaction->t_outstanding_credits);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700660
661 /* Bump b_count to prevent truncate from stumbling over
662 the shadowed buffer! @@@ This can go if we ever get
Jan Karaf5113ef2013-06-04 12:01:45 -0400663 rid of the shadow pairing of buffers. */
Dave Kleikamp470decc2006-10-11 01:20:57 -0700664 atomic_inc(&jh2bh(jh)->b_count);
665
Dave Kleikamp470decc2006-10-11 01:20:57 -0700666 /*
Jan Karaf5113ef2013-06-04 12:01:45 -0400667 * Make a temporary IO buffer with which to write it out
668 * (this will requeue the metadata buffer to BJ_Shadow).
Dave Kleikamp470decc2006-10-11 01:20:57 -0700669 */
Jan Karaf5113ef2013-06-04 12:01:45 -0400670 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700671 JBUFFER_TRACE(jh, "ph3: write metadata");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700672 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
Jan Karaf5113ef2013-06-04 12:01:45 -0400673 jh, &wbuf[bufs], blocknr);
Theodore Ts'oe6ec1162009-12-01 09:04:42 -0500674 if (flags < 0) {
675 jbd2_journal_abort(journal, flags);
676 continue;
677 }
Jan Karaf5113ef2013-06-04 12:01:45 -0400678 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700679
680 /* Record the new block's tag in the current descriptor
681 buffer */
682
683 tag_flag = 0;
684 if (flags & 1)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700685 tag_flag |= JBD2_FLAG_ESCAPE;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700686 if (!first_tag)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700687 tag_flag |= JBD2_FLAG_SAME_UUID;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700688
689 tag = (journal_block_tag_t *) tagp;
Zach Brownb517bea2006-10-11 01:21:08 -0700690 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
Darrick J. Wong8f888ef2012-05-22 22:43:41 -0400691 tag->t_flags = cpu_to_be16(tag_flag);
Jan Karaf5113ef2013-06-04 12:01:45 -0400692 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
Darrick J. Wongc3900872012-05-27 08:12:12 -0400693 commit_transaction->t_tid);
Zach Brownb517bea2006-10-11 01:21:08 -0700694 tagp += tag_bytes;
695 space_left -= tag_bytes;
Jan Karaf5113ef2013-06-04 12:01:45 -0400696 bufs++;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700697
698 if (first_tag) {
699 memcpy (tagp, journal->j_uuid, 16);
700 tagp += 16;
701 space_left -= 16;
702 first_tag = 0;
703 }
704
705 /* If there's no more to do, or if the descriptor is full,
706 let the IO rip! */
707
708 if (bufs == journal->j_wbufsize ||
709 commit_transaction->t_buffers == NULL ||
Darrick J. Wong3caa4872012-05-27 08:10:22 -0400710 space_left < tag_bytes + 16 + csum_size) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700711
Eryu Guanf2a44522011-11-01 19:09:18 -0400712 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700713
714 /* Write an end-of-descriptor marker before
715 submitting the IOs. "tag" still points to
716 the last tag we set up. */
717
Darrick J. Wong8f888ef2012-05-22 22:43:41 -0400718 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700719
Darrick J. Wong3caa4872012-05-27 08:10:22 -0400720 jbd2_descr_block_csum_set(journal, descriptor);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700721start_journal_io:
722 for (i = 0; i < bufs; i++) {
723 struct buffer_head *bh = wbuf[i];
Girish Shilamkar818d2762008-01-28 23:58:27 -0500724 /*
725 * Compute checksum.
726 */
727 if (JBD2_HAS_COMPAT_FEATURE(journal,
728 JBD2_FEATURE_COMPAT_CHECKSUM)) {
729 crc32_sum =
730 jbd2_checksum_data(crc32_sum, bh);
731 }
732
Dave Kleikamp470decc2006-10-11 01:20:57 -0700733 lock_buffer(bh);
734 clear_buffer_dirty(bh);
735 set_buffer_uptodate(bh);
736 bh->b_end_io = journal_end_buffer_io_sync;
Jens Axboe82f04ab2011-03-17 11:01:52 +0100737 submit_bh(WRITE_SYNC, bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700738 }
739 cond_resched();
Theodore Ts'obf699322009-09-30 00:32:06 -0400740 stats.run.rs_blocks_logged += bufs;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700741
742 /* Force a new descriptor to be generated next
743 time round the loop. */
744 descriptor = NULL;
745 bufs = 0;
746 }
747 }
748
Jan Karac851ed52008-07-11 19:27:31 -0400749 err = journal_finish_inode_data_buffers(journal, commit_transaction);
Hidehiro Kawaie9e34f42008-07-31 22:26:04 -0400750 if (err) {
Hidehiro Kawaie9e34f42008-07-31 22:26:04 -0400751 printk(KERN_WARNING
752 "JBD2: Detected IO errors while flushing file data "
Theodore Ts'o05496762008-09-16 14:36:17 -0400753 "on %s\n", journal->j_devname);
Hidehiro Kawai5bf56832008-10-10 22:12:43 -0400754 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
755 jbd2_journal_abort(journal, err);
Hidehiro Kawaie9e34f42008-07-31 22:26:04 -0400756 err = 0;
757 }
Jan Karac851ed52008-07-11 19:27:31 -0400758
Jan Kara33395782012-03-13 22:45:38 -0400759 /*
760 * Get current oldest transaction in the log before we issue flush
761 * to the filesystem device. After the flush we can be sure that
762 * blocks of all older transactions are checkpointed to persistent
763 * storage and we will be safe to update journal start in the
764 * superblock with the numbers we get here.
765 */
766 update_tail =
767 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
768
Jan Karabbd2be32011-05-24 11:59:18 -0400769 write_lock(&journal->j_state_lock);
Jan Kara33395782012-03-13 22:45:38 -0400770 if (update_tail) {
771 long freed = first_block - journal->j_tail;
772
773 if (first_block < journal->j_tail)
774 freed += journal->j_last - journal->j_first;
775 /* Update tail only if we free significant amount of space */
776 if (freed < journal->j_maxlen / 4)
777 update_tail = 0;
778 }
Jan Karabbd2be32011-05-24 11:59:18 -0400779 J_ASSERT(commit_transaction->t_state == T_COMMIT);
780 commit_transaction->t_state = T_COMMIT_DFLUSH;
781 write_unlock(&journal->j_state_lock);
Jan Kara33395782012-03-13 22:45:38 -0400782
Girish Shilamkar818d2762008-01-28 23:58:27 -0500783 /*
784 * If the journal is not located on the file system device,
785 * then we must flush the file system device before we issue
786 * the commit record
787 */
Jan Kara81be12c2011-05-24 11:52:40 -0400788 if (commit_transaction->t_need_data_flush &&
Girish Shilamkar818d2762008-01-28 23:58:27 -0500789 (journal->j_fs_dev != journal->j_dev) &&
790 (journal->j_flags & JBD2_BARRIER))
Shaohua Li99aa7842012-04-13 10:27:35 +0800791 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
Girish Shilamkar818d2762008-01-28 23:58:27 -0500792
793 /* Done it all: now write the commit record asynchronously. */
794 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
795 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
796 err = journal_submit_commit_record(journal, commit_transaction,
Dave Kleikamp470decc2006-10-11 01:20:57 -0700797 &cbh, crc32_sum);
798 if (err)
799 __jbd2_journal_abort_hard(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700800 }
801
Jens Axboe82f04ab2011-03-17 11:01:52 +0100802 blk_finish_plug(&plug);
803
Dave Kleikamp470decc2006-10-11 01:20:57 -0700804 /* Lo and behold: we have just managed to send a transaction to
805 the log. Before we can commit it, wait for the IO so far to
806 complete. Control buffers being written are on the
807 transaction's t_log_list queue, and metadata buffers are on
Jan Karaf5113ef2013-06-04 12:01:45 -0400808 the io_bufs list.
Dave Kleikamp470decc2006-10-11 01:20:57 -0700809
810 Wait for the buffers in reverse order. That way we are
811 less likely to be woken up until all IOs have completed, and
812 so we incur less scheduling load.
813 */
814
Eryu Guanf2a44522011-11-01 19:09:18 -0400815 jbd_debug(3, "JBD2: commit phase 3\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700816
Jan Karaf5113ef2013-06-04 12:01:45 -0400817 while (!list_empty(&io_bufs)) {
818 struct buffer_head *bh = list_entry(io_bufs.prev,
819 struct buffer_head,
820 b_assoc_buffers);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700821
Jan Karaf5113ef2013-06-04 12:01:45 -0400822 wait_on_buffer(bh);
823 cond_resched();
Dave Kleikamp470decc2006-10-11 01:20:57 -0700824
825 if (unlikely(!buffer_uptodate(bh)))
826 err = -EIO;
Jan Karaf5113ef2013-06-04 12:01:45 -0400827 jbd2_unfile_log_bh(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700828
829 /*
Jan Karaf5113ef2013-06-04 12:01:45 -0400830 * The list contains temporary buffer heads created by
831 * jbd2_journal_write_metadata_buffer().
Dave Kleikamp470decc2006-10-11 01:20:57 -0700832 */
833 BUFFER_TRACE(bh, "dumping temporary bh");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700834 __brelse(bh);
835 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
836 free_buffer_head(bh);
837
Jan Karaf5113ef2013-06-04 12:01:45 -0400838 /* We also have to refile the corresponding shadowed buffer */
Dave Kleikamp470decc2006-10-11 01:20:57 -0700839 jh = commit_transaction->t_shadow_list->b_tprev;
840 bh = jh2bh(jh);
Jan Karaf5113ef2013-06-04 12:01:45 -0400841 clear_buffer_jwrite(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700842 J_ASSERT_BH(bh, buffer_jbddirty(bh));
843
844 /* The metadata is now released for reuse, but we need
845 to remember it against this transaction so that when
846 we finally commit, we can do any checkpointing
847 required. */
848 JBUFFER_TRACE(jh, "file as BJ_Forget");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700849 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
Jan Kara229309c2011-05-08 19:09:53 -0400850 /*
851 * Wake up any transactions which were waiting for this IO to
852 * complete. The barrier must be here so that changes by
853 * jbd2_journal_file_buffer() take effect before wake_up_bit()
854 * does the waitqueue check.
855 */
856 smp_mb();
Dave Kleikamp470decc2006-10-11 01:20:57 -0700857 wake_up_bit(&bh->b_state, BH_Unshadow);
858 JBUFFER_TRACE(jh, "brelse shadowed buffer");
859 __brelse(bh);
860 }
861
862 J_ASSERT (commit_transaction->t_shadow_list == NULL);
863
Eryu Guanf2a44522011-11-01 19:09:18 -0400864 jbd_debug(3, "JBD2: commit phase 4\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700865
866 /* Here we wait for the revoke record and descriptor record buffers */
867 wait_for_ctlbuf:
868 while (commit_transaction->t_log_list != NULL) {
869 struct buffer_head *bh;
870
871 jh = commit_transaction->t_log_list->b_tprev;
872 bh = jh2bh(jh);
873 if (buffer_locked(bh)) {
874 wait_on_buffer(bh);
875 goto wait_for_ctlbuf;
876 }
877 if (cond_resched())
878 goto wait_for_ctlbuf;
879
880 if (unlikely(!buffer_uptodate(bh)))
881 err = -EIO;
882
883 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
884 clear_buffer_jwrite(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700885 jbd2_journal_unfile_buffer(journal, jh);
886 jbd2_journal_put_journal_head(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700887 __brelse(bh); /* One for getblk */
888 /* AKPM: bforget here */
889 }
890
Hidehiro Kawai77e841d2008-10-12 16:39:16 -0400891 if (err)
892 jbd2_journal_abort(journal, err);
893
Eryu Guanf2a44522011-11-01 19:09:18 -0400894 jbd_debug(3, "JBD2: commit phase 5\n");
Jan Karabbd2be32011-05-24 11:59:18 -0400895 write_lock(&journal->j_state_lock);
896 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
897 commit_transaction->t_state = T_COMMIT_JFLUSH;
898 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700899
Girish Shilamkar818d2762008-01-28 23:58:27 -0500900 if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
Theodore Ts'o0e3d2a62009-09-11 09:30:12 -0400901 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
Girish Shilamkar818d2762008-01-28 23:58:27 -0500902 err = journal_submit_commit_record(journal, commit_transaction,
903 &cbh, crc32_sum);
904 if (err)
905 __jbd2_journal_abort_hard(journal);
906 }
Zhang Huan6cba6112011-04-05 19:16:20 -0400907 if (cbh)
Theodore Ts'ofd984962009-01-05 21:34:13 -0500908 err = journal_wait_on_commit_record(journal, cbh);
Jan Karaf73bee42010-08-18 15:56:56 +0200909 if (JBD2_HAS_INCOMPAT_FEATURE(journal,
910 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
911 journal->j_flags & JBD2_BARRIER) {
Shaohua Li99aa7842012-04-13 10:27:35 +0800912 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
Jan Karaf73bee42010-08-18 15:56:56 +0200913 }
Dave Kleikamp470decc2006-10-11 01:20:57 -0700914
915 if (err)
Jan Karaa7fa2ba2007-10-16 18:38:25 -0400916 jbd2_journal_abort(journal, err);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700917
Jan Kara33395782012-03-13 22:45:38 -0400918 /*
919 * Now disk caches for filesystem device are flushed so we are safe to
920 * erase checkpointed transactions from the log by updating journal
921 * superblock.
922 */
923 if (update_tail)
924 jbd2_update_log_tail(journal, first_tid, first_block);
925
Dave Kleikamp470decc2006-10-11 01:20:57 -0700926 /* End of a transaction! Finally, we can do checkpoint
927 processing: any buffers committed as a result of this
928 transaction can be removed from any checkpoint list it was on
929 before. */
930
Eryu Guanf2a44522011-11-01 19:09:18 -0400931 jbd_debug(3, "JBD2: commit phase 6\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -0700932
Jan Karac851ed52008-07-11 19:27:31 -0400933 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
Dave Kleikamp470decc2006-10-11 01:20:57 -0700934 J_ASSERT(commit_transaction->t_buffers == NULL);
935 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700936 J_ASSERT(commit_transaction->t_shadow_list == NULL);
937 J_ASSERT(commit_transaction->t_log_list == NULL);
938
939restart_loop:
940 /*
941 * As there are other places (journal_unmap_buffer()) adding buffers
942 * to this list we have to be careful and hold the j_list_lock.
943 */
944 spin_lock(&journal->j_list_lock);
945 while (commit_transaction->t_forget) {
946 transaction_t *cp_transaction;
947 struct buffer_head *bh;
Jan Karade1b7942011-06-13 15:38:22 -0400948 int try_to_free = 0;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700949
950 jh = commit_transaction->t_forget;
951 spin_unlock(&journal->j_list_lock);
952 bh = jh2bh(jh);
Jan Karade1b7942011-06-13 15:38:22 -0400953 /*
954 * Get a reference so that bh cannot be freed before we are
955 * done with it.
956 */
957 get_bh(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700958 jbd_lock_bh_state(bh);
dingdinghua23e2af32010-02-24 12:11:20 -0500959 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700960
961 /*
962 * If there is undo-protected committed data against
963 * this buffer, then we can remove it now. If it is a
964 * buffer needing such protection, the old frozen_data
965 * field now points to a committed version of the
966 * buffer, so rotate that field to the new committed
967 * data.
968 *
969 * Otherwise, we can just throw away the frozen data now.
Joel Beckere06c8222008-09-11 15:35:47 -0700970 *
971 * We also know that the frozen data has already fired
972 * its triggers if they exist, so we can clear that too.
Dave Kleikamp470decc2006-10-11 01:20:57 -0700973 */
974 if (jh->b_committed_data) {
Mingming Caoaf1e76d2007-10-16 18:38:25 -0400975 jbd2_free(jh->b_committed_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700976 jh->b_committed_data = NULL;
977 if (jh->b_frozen_data) {
978 jh->b_committed_data = jh->b_frozen_data;
979 jh->b_frozen_data = NULL;
Joel Beckere06c8222008-09-11 15:35:47 -0700980 jh->b_frozen_triggers = NULL;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700981 }
982 } else if (jh->b_frozen_data) {
Mingming Caoaf1e76d2007-10-16 18:38:25 -0400983 jbd2_free(jh->b_frozen_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700984 jh->b_frozen_data = NULL;
Joel Beckere06c8222008-09-11 15:35:47 -0700985 jh->b_frozen_triggers = NULL;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700986 }
987
988 spin_lock(&journal->j_list_lock);
989 cp_transaction = jh->b_cp_transaction;
990 if (cp_transaction) {
991 JBUFFER_TRACE(jh, "remove from old cp transaction");
Johann Lombardi8e85fb32008-01-28 23:58:27 -0500992 cp_transaction->t_chp_stats.cs_dropped++;
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700993 __jbd2_journal_remove_checkpoint(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700994 }
995
996 /* Only re-checkpoint the buffer_head if it is marked
997 * dirty. If the buffer was added to the BJ_Forget list
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700998 * by jbd2_journal_forget, it may no longer be dirty and
Dave Kleikamp470decc2006-10-11 01:20:57 -0700999 * there's no point in keeping a checkpoint record for
1000 * it. */
1001
Jan Karab794e7a2012-09-26 23:11:13 -04001002 /*
1003 * A buffer which has been freed while still being journaled by
1004 * a previous transaction.
1005 */
1006 if (buffer_freed(bh)) {
1007 /*
1008 * If the running transaction is the one containing
1009 * "add to orphan" operation (b_next_transaction !=
1010 * NULL), we have to wait for that transaction to
1011 * commit before we can really get rid of the buffer.
1012 * So just clear b_modified to not confuse transaction
1013 * credit accounting and refile the buffer to
1014 * BJ_Forget of the running transaction. If the just
1015 * committed transaction contains "add to orphan"
1016 * operation, we can completely invalidate the buffer
1017 * now. We are rather through in that since the
1018 * buffer may be still accessible when blocksize <
1019 * pagesize and it is attached to the last partial
1020 * page.
1021 */
1022 jh->b_modified = 0;
1023 if (!jh->b_next_transaction) {
1024 clear_buffer_freed(bh);
1025 clear_buffer_jbddirty(bh);
1026 clear_buffer_mapped(bh);
1027 clear_buffer_new(bh);
1028 clear_buffer_req(bh);
1029 bh->b_bdev = NULL;
1030 }
Dave Kleikamp470decc2006-10-11 01:20:57 -07001031 }
1032
1033 if (buffer_jbddirty(bh)) {
1034 JBUFFER_TRACE(jh, "add to new checkpointing trans");
Mingming Caof7f4bcc2006-10-11 01:20:59 -07001035 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
Hidehiro Kawai7ad74452008-10-10 20:29:31 -04001036 if (is_journal_aborted(journal))
1037 clear_buffer_jbddirty(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001038 } else {
1039 J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Karade1b7942011-06-13 15:38:22 -04001040 /*
1041 * The buffer on BJ_Forget list and not jbddirty means
Dave Kleikamp470decc2006-10-11 01:20:57 -07001042 * it has been freed by this transaction and hence it
1043 * could not have been reallocated until this
1044 * transaction has committed. *BUT* it could be
1045 * reallocated once we have written all the data to
1046 * disk and before we process the buffer on BJ_Forget
Jan Karade1b7942011-06-13 15:38:22 -04001047 * list.
1048 */
1049 if (!jh->b_next_transaction)
1050 try_to_free = 1;
Dave Kleikamp470decc2006-10-11 01:20:57 -07001051 }
Jan Karade1b7942011-06-13 15:38:22 -04001052 JBUFFER_TRACE(jh, "refile or unfile buffer");
1053 __jbd2_journal_refile_buffer(jh);
1054 jbd_unlock_bh_state(bh);
1055 if (try_to_free)
1056 release_buffer_page(bh); /* Drops bh reference */
1057 else
1058 __brelse(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001059 cond_resched_lock(&journal->j_list_lock);
1060 }
1061 spin_unlock(&journal->j_list_lock);
1062 /*
Jan Karaf5a7a6b2008-01-28 23:58:27 -05001063 * This is a bit sleazy. We use j_list_lock to protect transition
1064 * of a transaction into T_FINISHED state and calling
1065 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1066 * other checkpointing code processing the transaction...
Dave Kleikamp470decc2006-10-11 01:20:57 -07001067 */
Theodore Ts'oa931da62010-08-03 21:35:12 -04001068 write_lock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001069 spin_lock(&journal->j_list_lock);
1070 /*
1071 * Now recheck if some buffers did not get attached to the transaction
1072 * while the lock was dropped...
1073 */
1074 if (commit_transaction->t_forget) {
1075 spin_unlock(&journal->j_list_lock);
Theodore Ts'oa931da62010-08-03 21:35:12 -04001076 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001077 goto restart_loop;
1078 }
1079
1080 /* Done with this transaction! */
1081
Eryu Guanf2a44522011-11-01 19:09:18 -04001082 jbd_debug(3, "JBD2: commit phase 7\n");
Dave Kleikamp470decc2006-10-11 01:20:57 -07001083
Jan Karabbd2be32011-05-24 11:59:18 -04001084 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001085
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001086 commit_transaction->t_start = jiffies;
Theodore Ts'obf699322009-09-30 00:32:06 -04001087 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1088 commit_transaction->t_start);
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001089
1090 /*
Theodore Ts'obf699322009-09-30 00:32:06 -04001091 * File the transaction statistics
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001092 */
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001093 stats.ts_tid = commit_transaction->t_tid;
Theodore Ts'o8dd42042010-08-03 21:38:29 -04001094 stats.run.rs_handle_count =
1095 atomic_read(&commit_transaction->t_handle_count);
Theodore Ts'obf699322009-09-30 00:32:06 -04001096 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1097 commit_transaction->t_tid, &stats.run);
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001098
1099 /*
1100 * Calculate overall stats
1101 */
Theodore Ts'obf699322009-09-30 00:32:06 -04001102 spin_lock(&journal->j_history_lock);
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001103 journal->j_stats.ts_tid++;
Theodore Ts'o9fff24a2013-02-06 22:30:23 -05001104 if (commit_transaction->t_requested)
1105 journal->j_stats.ts_requested++;
Theodore Ts'obf699322009-09-30 00:32:06 -04001106 journal->j_stats.run.rs_wait += stats.run.rs_wait;
Theodore Ts'o9fff24a2013-02-06 22:30:23 -05001107 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
Theodore Ts'obf699322009-09-30 00:32:06 -04001108 journal->j_stats.run.rs_running += stats.run.rs_running;
1109 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1110 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1111 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1112 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1113 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1114 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
Johann Lombardi8e85fb32008-01-28 23:58:27 -05001115 spin_unlock(&journal->j_history_lock);
1116
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001117 commit_transaction->t_state = T_COMMIT_CALLBACK;
Dave Kleikamp470decc2006-10-11 01:20:57 -07001118 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1119 journal->j_commit_sequence = commit_transaction->t_tid;
1120 journal->j_committing_transaction = NULL;
Josef Bacike07f7182008-11-26 01:14:26 -05001121 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
Dave Kleikamp470decc2006-10-11 01:20:57 -07001122
Josef Bacike07f7182008-11-26 01:14:26 -05001123 /*
1124 * weight the commit time higher than the average time so we don't
1125 * react too strongly to vast changes in the commit time
1126 */
1127 if (likely(journal->j_average_commit_time))
1128 journal->j_average_commit_time = (commit_time +
1129 journal->j_average_commit_time*3) / 4;
1130 else
1131 journal->j_average_commit_time = commit_time;
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001132
Theodore Ts'oa931da62010-08-03 21:35:12 -04001133 write_unlock(&journal->j_state_lock);
Theodore Ts'o6c20ec82008-10-28 21:08:20 -04001134
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001135 if (journal->j_checkpoint_transactions == NULL) {
1136 journal->j_checkpoint_transactions = commit_transaction;
1137 commit_transaction->t_cpnext = commit_transaction;
1138 commit_transaction->t_cpprev = commit_transaction;
Dave Kleikamp470decc2006-10-11 01:20:57 -07001139 } else {
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001140 commit_transaction->t_cpnext =
1141 journal->j_checkpoint_transactions;
1142 commit_transaction->t_cpprev =
1143 commit_transaction->t_cpnext->t_cpprev;
1144 commit_transaction->t_cpnext->t_cpprev =
1145 commit_transaction;
1146 commit_transaction->t_cpprev->t_cpnext =
Dave Kleikamp470decc2006-10-11 01:20:57 -07001147 commit_transaction;
Dave Kleikamp470decc2006-10-11 01:20:57 -07001148 }
1149 spin_unlock(&journal->j_list_lock);
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001150 /* Drop all spin_locks because commit_callback may block.
1151 * __jbd2_journal_remove_checkpoint() can not destroy transaction
1152 * under us because it is not marked as T_FINISHED yet */
Aneesh Kumar K.Vfb684072008-11-06 17:50:21 -05001153 if (journal->j_commit_callback)
1154 journal->j_commit_callback(journal, commit_transaction);
1155
Theodore Ts'o879c5e62009-06-17 11:47:48 -04001156 trace_jbd2_end_commit(journal, commit_transaction);
Eryu Guanf2a44522011-11-01 19:09:18 -04001157 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
Dave Kleikamp470decc2006-10-11 01:20:57 -07001158 journal->j_commit_sequence, journal->j_tail_sequence);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001159
Dmitry Monakhov794446c2013-04-03 22:06:52 -04001160 write_lock(&journal->j_state_lock);
1161 spin_lock(&journal->j_list_lock);
1162 commit_transaction->t_state = T_FINISHED;
1163 /* Recheck checkpoint lists after j_list_lock was dropped */
1164 if (commit_transaction->t_checkpoint_list == NULL &&
1165 commit_transaction->t_checkpoint_io_list == NULL) {
1166 __jbd2_journal_drop_transaction(journal, commit_transaction);
1167 jbd2_journal_free_transaction(commit_transaction);
1168 }
1169 spin_unlock(&journal->j_list_lock);
1170 write_unlock(&journal->j_state_lock);
Dave Kleikamp470decc2006-10-11 01:20:57 -07001171 wake_up(&journal->j_wait_done_commit);
1172}