/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");
        if (uptodate)
                set_buffer_uptodate(bh);
        else
                clear_buffer_uptodate(bh);
        unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
        struct page *page;

        if (buffer_dirty(bh))
                goto nope;
        if (atomic_read(&bh->b_count) != 1)
                goto nope;
        page = bh->b_page;
        if (!page)
                goto nope;
        if (page->mapping)
                goto nope;

        /* OK, it's a truncated page */
        if (!trylock_page(page))
                goto nope;

        page_cache_get(page);
        __brelse(bh);
        try_to_free_buffers(page);
        unlock_page(page);
        page_cache_release(page);
        return;

nope:
        __brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
                                        transaction_t *commit_transaction,
                                        struct buffer_head **cbh,
                                        __u32 crc32_sum)
{
        struct journal_head *descriptor;
        struct commit_header *tmp;
        struct buffer_head *bh;
        int ret;
        struct timespec now = current_kernel_time();

        *cbh = NULL;

        if (is_journal_aborted(journal))
                return 0;

        descriptor = jbd2_journal_get_descriptor_buffer(journal);
        if (!descriptor)
                return 1;

        bh = jh2bh(descriptor);

        tmp = (struct commit_header *)bh->b_data;
        tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
        tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
        tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
        tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
        tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

        if (JBD2_HAS_COMPAT_FEATURE(journal,
                                    JBD2_FEATURE_COMPAT_CHECKSUM)) {
                tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
                tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
        }

        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
        clear_buffer_dirty(bh);
        set_buffer_uptodate(bh);
        bh->b_end_io = journal_end_buffer_io_sync;

        if (journal->j_flags & JBD2_BARRIER &&
            !JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
                ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
        else
                ret = submit_bh(WRITE_SYNC, bh);

        *cbh = bh;
        return ret;
}

/*
 * This function, along with journal_submit_commit_record, allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
                                         struct buffer_head *bh)
{
        int ret = 0;

        clear_buffer_dirty(bh);
        wait_on_buffer(bh);

        if (unlikely(!buffer_uptodate(bh)))
                ret = -EIO;
        put_bh(bh);             /* One for getblk() */
        jbd2_journal_put_journal_head(bh2jh(bh));

        return ret;
}

/*
 * Write the filemap data using the writepage() address_space_operation.
 * We don't do block allocation here, even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
        int ret;
        struct writeback_control wbc = {
                .sync_mode =  WB_SYNC_ALL,
                .nr_to_write = mapping->nrpages * 2,
                .range_start = 0,
                .range_end = i_size_read(mapping->host),
        };

        ret = generic_writepages(mapping, &wbc);
        return ret;
}

/*
 * Submit the data buffers of all inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction, therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode;
        int err, ret = 0;
        struct address_space *mapping;

        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                mapping = jinode->i_vfs_inode->i_mapping;
                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                /*
                 * Submit the inode data buffers. We use writepage
                 * instead of writepages, because writepages can do
                 * block allocation with delalloc. We need to write
                 * only allocated blocks here.
                 */
                trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
                err = journal_submit_inode_data_buffers(mapping);
                if (!ret)
                        ret = err;
                spin_lock(&journal->j_list_lock);
                J_ASSERT(jinode->i_transaction == commit_transaction);
                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }
        spin_unlock(&journal->j_list_lock);
        return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
                transaction_t *commit_transaction)
{
        struct jbd2_inode *jinode, *next_i;
        int err, ret = 0;

        /* For locking, see the comment in journal_submit_data_buffers() */
        spin_lock(&journal->j_list_lock);
        list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
                set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                spin_unlock(&journal->j_list_lock);
                err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
                if (err) {
                        /*
                         * Because AS_EIO is cleared by
                         * filemap_fdatawait_range(), set it again so
                         * that user process can get -EIO from fsync().
                         */
                        set_bit(AS_EIO,
                                &jinode->i_vfs_inode->i_mapping->flags);

                        if (!ret)
                                ret = err;
                }
                spin_lock(&journal->j_list_lock);
                clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
                smp_mb__after_clear_bit();
                wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
        }

        /* Now refile inode to proper lists */
        list_for_each_entry_safe(jinode, next_i,
                                 &commit_transaction->t_inode_list, i_list) {
                list_del(&jinode->i_list);
                if (jinode->i_next_transaction) {
                        jinode->i_transaction = jinode->i_next_transaction;
                        jinode->i_next_transaction = NULL;
                        list_add(&jinode->i_list,
                                &jinode->i_transaction->t_inode_list);
                } else {
                        jinode->i_transaction = NULL;
                }
        }
        spin_unlock(&journal->j_list_lock);

        return ret;
}

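/*
 * Fold a journal buffer's contents into the running CRC32 used for the
 * JBD2_FEATURE_COMPAT_CHECKSUM commit-block checksum.
 */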
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        char *addr;
        __u32 checksum;

        addr = kmap_atomic(page);
        checksum = crc32_be(crc32_sum,
                (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
        kunmap_atomic(addr);

        return checksum;
}

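/*
 * Record a buffer's on-disk block number in a descriptor block tag,
 * splitting the value across t_blocknr and t_blocknr_high when the
 * journal uses tags larger than JBD2_TAG_SIZE32 (64-bit block numbers).
 */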
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                                   unsigned long long block)
{
        tag->t_blocknr = cpu_to_be32(block & (u32)~0);
        if (tag_bytes > JBD2_TAG_SIZE32)
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

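/*
 * Compute the checksum of a descriptor block and store it in the block's
 * tail, if the JBD2_FEATURE_INCOMPAT_CSUM_V2 feature is enabled.
 */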
static void jbd2_descr_block_csum_set(journal_t *j,
                                      struct journal_head *descriptor)
{
        struct jbd2_journal_block_tail *tail;
        __u32 csum;

        if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
                return;

        tail = (struct jbd2_journal_block_tail *)
                        (jh2bh(descriptor)->b_data + j->j_blocksize -
                        sizeof(struct jbd2_journal_block_tail));
        tail->t_checksum = 0;
        csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
                           j->j_blocksize);
        tail->t_checksum = cpu_to_be32(csum);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
        struct transaction_stats_s stats;
        transaction_t *commit_transaction;
        struct journal_head *jh, *new_jh, *descriptor;
        struct buffer_head **wbuf = journal->j_wbuf;
        int bufs;
        int flags;
        int err;
        unsigned long long blocknr;
        ktime_t start_time;
        u64 commit_time;
        char *tagp = NULL;
        journal_header_t *header;
        journal_block_tag_t *tag = NULL;
        int space_left = 0;
        int first_tag = 0;
        int tag_flag;
        int i, to_free = 0;
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
        struct blk_plug plug;
        /* Tail of the journal */
        unsigned long first_block;
        tid_t first_tid;
        int update_tail;
        int csum_size = 0;

        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
                csum_size = sizeof(struct jbd2_journal_block_tail);

        /*
         * First job: lock down the current transaction and wait for
         * all outstanding updates to complete.
         */

        /* Do we need to erase the effects of a prior jbd2_journal_flush? */
        if (journal->j_flags & JBD2_FLUSHED) {
                jbd_debug(3, "super block updated\n");
                mutex_lock(&journal->j_checkpoint_mutex);
                /*
                 * We hold j_checkpoint_mutex so tail cannot change under us.
                 * We don't need any special data guarantees for writing sb
                 * since journal is empty and it is ok for write to be
                 * flushed only with transaction commit.
                 */
                jbd2_journal_update_sb_log_tail(journal,
                                                journal->j_tail_sequence,
                                                journal->j_tail,
                                                WRITE_SYNC);
                mutex_unlock(&journal->j_checkpoint_mutex);
        } else {
                jbd_debug(3, "superblock not updated\n");
        }

        J_ASSERT(journal->j_running_transaction != NULL);
        J_ASSERT(journal->j_committing_transaction == NULL);

        commit_transaction = journal->j_running_transaction;
        J_ASSERT(commit_transaction->t_state == T_RUNNING);

        trace_jbd2_start_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: starting commit of transaction %d\n",
                        commit_transaction->t_tid);

        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;

        trace_jbd2_commit_locking(journal, commit_transaction);
        stats.run.rs_wait = commit_transaction->t_max_wait;
        stats.run.rs_locked = jiffies;
        stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
                                              stats.run.rs_locked);

        spin_lock(&commit_transaction->t_handle_lock);
        while (atomic_read(&commit_transaction->t_updates)) {
                DEFINE_WAIT(wait);

                prepare_to_wait(&journal->j_wait_updates, &wait,
                                        TASK_UNINTERRUPTIBLE);
                if (atomic_read(&commit_transaction->t_updates)) {
                        spin_unlock(&commit_transaction->t_handle_lock);
                        write_unlock(&journal->j_state_lock);
                        schedule();
                        write_lock(&journal->j_state_lock);
                        spin_lock(&commit_transaction->t_handle_lock);
                }
                finish_wait(&journal->j_wait_updates, &wait);
        }
        spin_unlock(&commit_transaction->t_handle_lock);

        J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
                        journal->j_max_transaction_buffers);

        /*
         * First thing we are allowed to do is to discard any remaining
         * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
         * that there are no such buffers: if a large filesystem
         * operation like a truncate needs to split itself over multiple
         * transactions, then it may try to do a jbd2_journal_restart() while
         * there are still BJ_Reserved buffers outstanding.  These must
         * be released cleanly from the current transaction.
         *
         * In this case, the filesystem must still reserve write access
         * again before modifying the buffer in the new transaction, but
         * we do not require it to remember exactly which old buffers it
         * has reserved.  This is consistent with the existing behaviour
         * that multiple jbd2_journal_get_write_access() calls to the same
         * buffer are perfectly permissible.
         */
        while (commit_transaction->t_reserved_list) {
                jh = commit_transaction->t_reserved_list;
                JBUFFER_TRACE(jh, "reserved, unused: refile");
                /*
                 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
                 * leave undo-committed data.
                 */
                if (jh->b_committed_data) {
                        struct buffer_head *bh = jh2bh(jh);

                        jbd_lock_bh_state(bh);
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        jbd_unlock_bh_state(bh);
                }
                jbd2_journal_refile_buffer(journal, jh);
        }

        /*
         * Now try to drop any written-back buffers from the journal's
         * checkpoint lists.  We do this *before* commit because it potentially
         * frees some memory
         */
        spin_lock(&journal->j_list_lock);
        __jbd2_journal_clean_checkpoint_list(journal);
        spin_unlock(&journal->j_list_lock);

        jbd_debug(3, "JBD2: commit phase 1\n");

        /*
         * Clear the revoked flag to reflect that there are no revoked buffers
         * in the next transaction which is going to be started.
         */
        jbd2_clear_buffer_revoked_flags(journal);

        /*
         * Switch to a new revoke table.
         */
        jbd2_journal_switch_revoke_table(journal);

        trace_jbd2_commit_flushing(journal, commit_transaction);
        stats.run.rs_flushing = jiffies;
        stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
                                             stats.run.rs_flushing);

        commit_transaction->t_state = T_FLUSH;
        journal->j_committing_transaction = commit_transaction;
        journal->j_running_transaction = NULL;
        start_time = ktime_get();
        commit_transaction->t_log_start = journal->j_head;
        wake_up(&journal->j_wait_transaction_locked);
        write_unlock(&journal->j_state_lock);

        jbd_debug(3, "JBD2: commit phase 2\n");

        /*
         * Now start flushing things to disk, in the order they appear
         * on the transaction lists.  Data blocks go first.
         */
        err = journal_submit_data_buffers(journal, commit_transaction);
        if (err)
                jbd2_journal_abort(journal, err);

        blk_start_plug(&plug);
        jbd2_journal_write_revoke_records(journal, commit_transaction,
                                          WRITE_SYNC);
        blk_finish_plug(&plug);

        jbd_debug(3, "JBD2: commit phase 2\n");

        /*
         * Way to go: we have now written out all of the data for a
         * transaction!  Now comes the tricky part: we need to write out
         * metadata.  Loop over the transaction's entire buffer list:
         */
        write_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_COMMIT;
        write_unlock(&journal->j_state_lock);

        trace_jbd2_commit_logging(journal, commit_transaction);
        stats.run.rs_logging = jiffies;
        stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
                                               stats.run.rs_logging);
        stats.run.rs_blocks =
                atomic_read(&commit_transaction->t_outstanding_credits);
        stats.run.rs_blocks_logged = 0;

        J_ASSERT(commit_transaction->t_nr_buffers <=
                 atomic_read(&commit_transaction->t_outstanding_credits));

        err = 0;
        descriptor = NULL;
        bufs = 0;
        blk_start_plug(&plug);
        while (commit_transaction->t_buffers) {

                /* Find the next buffer to be journaled... */

                jh = commit_transaction->t_buffers;

                /* If we're in abort mode, we just un-journal the buffer and
                   release it. */

                if (is_journal_aborted(journal)) {
                        clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_buffer_abort_trigger(jh,
                                                  jh->b_frozen_data ?
                                                  jh->b_frozen_triggers :
                                                  jh->b_triggers);
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
                         * any descriptor buffers which may have been
                         * already allocated, even if we are now
                         * aborting. */
                        if (!commit_transaction->t_buffers)
                                goto start_journal_io;
                        continue;
                }

                /* Make sure we have a descriptor block in which to
                   record the metadata buffer. */

                if (!descriptor) {
                        struct buffer_head *bh;

                        J_ASSERT (bufs == 0);

                        jbd_debug(4, "JBD2: get descriptor\n");

                        descriptor = jbd2_journal_get_descriptor_buffer(journal);
                        if (!descriptor) {
                                jbd2_journal_abort(journal, -EIO);
                                continue;
                        }

                        bh = jh2bh(descriptor);
                        jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
                                (unsigned long long)bh->b_blocknr, bh->b_data);
                        header = (journal_header_t *)&bh->b_data[0];
                        header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
                        header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
                        header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        space_left = bh->b_size - sizeof(journal_header_t);
                        first_tag = 1;
                        set_buffer_jwrite(bh);
                        set_buffer_dirty(bh);
                        wbuf[bufs++] = bh;

                        /* Record it so that we can wait for IO
                           completion later */
                        BUFFER_TRACE(bh, "ph3: file as descriptor");
                        jbd2_journal_file_buffer(descriptor, commit_transaction,
                                        BJ_LogCtl);
                }

                /* Where is the buffer to be written? */

                err = jbd2_journal_next_log_block(journal, &blocknr);
                /* If the block mapping failed, just abandon the buffer
                   and repeat this loop: we'll fall into the
                   refile-on-abort condition above. */
                if (err) {
                        jbd2_journal_abort(journal, err);
                        continue;
                }

                /*
                 * start_this_handle() uses t_outstanding_credits to determine
                 * the free space in the log, but this counter is changed
                 * by jbd2_journal_next_log_block() also.
                 */
                atomic_dec(&commit_transaction->t_outstanding_credits);

                /* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
                atomic_inc(&jh2bh(jh)->b_count);

                /* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO*/

                set_bit(BH_JWrite, &jh2bh(jh)->b_state);
                /*
                 * akpm: jbd2_journal_write_metadata_buffer() sets
                 * new_bh->b_transaction to commit_transaction.
                 * We need to clean this up before we release new_bh
                 * (which is of type BJ_IO)
                 */
                JBUFFER_TRACE(jh, "ph3: write metadata");
                flags = jbd2_journal_write_metadata_buffer(commit_transaction,
                                                      jh, &new_jh, blocknr);
                if (flags < 0) {
                        jbd2_journal_abort(journal, flags);
                        continue;
                }
                set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
                wbuf[bufs++] = jh2bh(new_jh);

                /* Record the new block's tag in the current descriptor
                   buffer */

                tag_flag = 0;
                if (flags & 1)
                        tag_flag |= JBD2_FLAG_ESCAPE;
                if (!first_tag)
                        tag_flag |= JBD2_FLAG_SAME_UUID;

                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
                tag->t_flags = cpu_to_be16(tag_flag);
                tagp += tag_bytes;
                space_left -= tag_bytes;

                if (first_tag) {
                        memcpy (tagp, journal->j_uuid, 16);
                        tagp += 16;
                        space_left -= 16;
                        first_tag = 0;
                }

                /* If there's no more to do, or if the descriptor is full,
                   let the IO rip! */

                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
                    space_left < tag_bytes + 16 + csum_size) {

                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

                        /* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

                        tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);

                        jbd2_descr_block_csum_set(journal, descriptor);
start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
                                /*
                                 * Compute checksum.
                                 */
                                if (JBD2_HAS_COMPAT_FEATURE(journal,
                                        JBD2_FEATURE_COMPAT_CHECKSUM)) {
                                        crc32_sum =
                                            jbd2_checksum_data(crc32_sum, bh);
                                }

                                lock_buffer(bh);
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
                                submit_bh(WRITE_SYNC, bh);
                        }
                        cond_resched();
                        stats.run.rs_blocks_logged += bufs;

                        /* Force a new descriptor to be generated next
                           time round the loop. */
                        descriptor = NULL;
                        bufs = 0;
                }
        }

        err = journal_finish_inode_data_buffers(journal, commit_transaction);
        if (err) {
                printk(KERN_WARNING
                        "JBD2: Detected IO errors while flushing file data "
                        "on %s\n", journal->j_devname);
                if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
                        jbd2_journal_abort(journal, err);
                err = 0;
        }

        /*
         * Get current oldest transaction in the log before we issue flush
         * to the filesystem device. After the flush we can be sure that
         * blocks of all older transactions are checkpointed to persistent
         * storage and we will be safe to update journal start in the
         * superblock with the numbers we get here.
         */
        update_tail =
                jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

        write_lock(&journal->j_state_lock);
        if (update_tail) {
                long freed = first_block - journal->j_tail;

                if (first_block < journal->j_tail)
                        freed += journal->j_last - journal->j_first;
                /* Update tail only if we free significant amount of space */
                if (freed < journal->j_maxlen / 4)
                        update_tail = 0;
        }
        J_ASSERT(commit_transaction->t_state == T_COMMIT);
        commit_transaction->t_state = T_COMMIT_DFLUSH;
        write_unlock(&journal->j_state_lock);

        /*
         * If the journal is not located on the file system device,
         * then we must flush the file system device before we issue
         * the commit record
         */
        if (commit_transaction->t_need_data_flush &&
            (journal->j_fs_dev != journal->j_dev) &&
            (journal->j_flags & JBD2_BARRIER))
                blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

        /* Done it all: now write the commit record asynchronously. */
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                 &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }

        blk_finish_plug(&plug);

        /* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

           Wait for the buffers in reverse order.  That way we are
           less likely to be woken up until all IOs have completed, and
           so we incur less scheduling load.
         */

        jbd_debug(3, "JBD2: commit phase 3\n");

        /*
         * akpm: these are BJ_IO, and j_list_lock is not needed.
         * See __journal_try_to_free_buffer.
         */
wait_for_iobuf:
        while (commit_transaction->t_iobuf_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_iobuf_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_iobuf;
                }
                if (cond_resched())
                        goto wait_for_iobuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                clear_buffer_jwrite(bh);

                JBUFFER_TRACE(jh, "ph4: unfile after journal write");
                jbd2_journal_unfile_buffer(journal, jh);

                /*
                 * ->t_iobuf_list should contain only dummy buffer_heads
                 * which were created by jbd2_journal_write_metadata_buffer().
                 */
                BUFFER_TRACE(bh, "dumping temporary bh");
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);
                J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
                free_buffer_head(bh);

                /* We also have to unlock and free the corresponding
                   shadowed buffer */
                jh = commit_transaction->t_shadow_list->b_tprev;
                bh = jh2bh(jh);
                clear_bit(BH_JWrite, &bh->b_state);
                J_ASSERT_BH(bh, buffer_jbddirty(bh));

                /* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
                JBUFFER_TRACE(jh, "file as BJ_Forget");
                jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
                /*
                 * Wake up any transactions which were waiting for this IO to
                 * complete. The barrier must be here so that changes by
                 * jbd2_journal_file_buffer() take effect before wake_up_bit()
                 * does the waitqueue check.
                 */
                smp_mb();
                wake_up_bit(&bh->b_state, BH_Unshadow);
                JBUFFER_TRACE(jh, "brelse shadowed buffer");
                __brelse(bh);
        }

        J_ASSERT (commit_transaction->t_shadow_list == NULL);

        jbd_debug(3, "JBD2: commit phase 4\n");

        /* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
        while (commit_transaction->t_log_list != NULL) {
                struct buffer_head *bh;

                jh = commit_transaction->t_log_list->b_tprev;
                bh = jh2bh(jh);
                if (buffer_locked(bh)) {
                        wait_on_buffer(bh);
                        goto wait_for_ctlbuf;
                }
                if (cond_resched())
                        goto wait_for_ctlbuf;

                if (unlikely(!buffer_uptodate(bh)))
                        err = -EIO;

                BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
                clear_buffer_jwrite(bh);
                jbd2_journal_unfile_buffer(journal, jh);
                jbd2_journal_put_journal_head(jh);
                __brelse(bh);           /* One for getblk */
                /* AKPM: bforget here */
        }

        if (err)
                jbd2_journal_abort(journal, err);

        jbd_debug(3, "JBD2: commit phase 5\n");
        write_lock(&journal->j_state_lock);
        J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
        commit_transaction->t_state = T_COMMIT_JFLUSH;
        write_unlock(&journal->j_state_lock);

        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
                err = journal_submit_commit_record(journal, commit_transaction,
                                                &cbh, crc32_sum);
                if (err)
                        __jbd2_journal_abort_hard(journal);
        }
        if (cbh)
                err = journal_wait_on_commit_record(journal, cbh);
        if (JBD2_HAS_INCOMPAT_FEATURE(journal,
                                      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
            journal->j_flags & JBD2_BARRIER) {
                blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
        }

        if (err)
                jbd2_journal_abort(journal, err);

        /*
         * Now disk caches for filesystem device are flushed so we are safe to
         * erase checkpointed transactions from the log by updating journal
         * superblock.
         */
        if (update_tail)
                jbd2_update_log_tail(journal, first_tid, first_block);

        /* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

        jbd_debug(3, "JBD2: commit phase 6\n");

        J_ASSERT(list_empty(&commit_transaction->t_inode_list));
        J_ASSERT(commit_transaction->t_buffers == NULL);
        J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
        J_ASSERT(commit_transaction->t_iobuf_list == NULL);
        J_ASSERT(commit_transaction->t_shadow_list == NULL);
        J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
        /*
         * As there are other places (journal_unmap_buffer()) adding buffers
         * to this list we have to be careful and hold the j_list_lock.
         */
        spin_lock(&journal->j_list_lock);
        while (commit_transaction->t_forget) {
                transaction_t *cp_transaction;
                struct buffer_head *bh;
                int try_to_free = 0;

                jh = commit_transaction->t_forget;
                spin_unlock(&journal->j_list_lock);
                bh = jh2bh(jh);
                /*
                 * Get a reference so that bh cannot be freed before we are
                 * done with it.
                 */
                get_bh(bh);
                jbd_lock_bh_state(bh);
                J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);

                /*
                 * If there is undo-protected committed data against
                 * this buffer, then we can remove it now.  If it is a
                 * buffer needing such protection, the old frozen_data
                 * field now points to a committed version of the
                 * buffer, so rotate that field to the new committed
                 * data.
                 *
                 * Otherwise, we can just throw away the frozen data now.
                 *
                 * We also know that the frozen data has already fired
                 * its triggers if they exist, so we can clear that too.
                 */
                if (jh->b_committed_data) {
                        jbd2_free(jh->b_committed_data, bh->b_size);
                        jh->b_committed_data = NULL;
                        if (jh->b_frozen_data) {
                                jh->b_committed_data = jh->b_frozen_data;
                                jh->b_frozen_data = NULL;
                                jh->b_frozen_triggers = NULL;
                        }
                } else if (jh->b_frozen_data) {
                        jbd2_free(jh->b_frozen_data, bh->b_size);
                        jh->b_frozen_data = NULL;
                        jh->b_frozen_triggers = NULL;
                }

                spin_lock(&journal->j_list_lock);
                cp_transaction = jh->b_cp_transaction;
                if (cp_transaction) {
                        JBUFFER_TRACE(jh, "remove from old cp transaction");
                        cp_transaction->t_chp_stats.cs_dropped++;
                        __jbd2_journal_remove_checkpoint(jh);
                }

                /* Only re-checkpoint the buffer_head if it is marked
                 * dirty.  If the buffer was added to the BJ_Forget list
                 * by jbd2_journal_forget, it may no longer be dirty and
                 * there's no point in keeping a checkpoint record for
                 * it. */

                /* A buffer which has been freed while still being
                 * journaled by a previous transaction may end up still
                 * being dirty here, but we want to avoid writing back
                 * that buffer in the future after the "add to orphan"
                 * operation has been committed.  That's not only a performance
                 * gain, it also stops aliasing problems if the buffer is
                 * left behind for writeback and gets reallocated for another
                 * use in a different page. */
                if (buffer_freed(bh) && !jh->b_next_transaction) {
                        clear_buffer_freed(bh);
                        clear_buffer_jbddirty(bh);
                }

                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
                        if (is_journal_aborted(journal))
                                clear_buffer_jbddirty(bh);
                } else {
                        J_ASSERT_BH(bh, !buffer_dirty(bh));
                        /*
                         * A buffer on the BJ_Forget list that is not jbddirty
                         * has been freed by this transaction and hence it
                         * could not have been reallocated until this
                         * transaction has committed. *BUT* it could be
                         * reallocated once we have written all the data to
                         * disk and before we process the buffer on the
                         * BJ_Forget list.
                         */
                        if (!jh->b_next_transaction)
                                try_to_free = 1;
                }
                JBUFFER_TRACE(jh, "refile or unfile buffer");
                __jbd2_journal_refile_buffer(jh);
                jbd_unlock_bh_state(bh);
                if (try_to_free)
                        release_buffer_page(bh);        /* Drops bh reference */
                else
                        __brelse(bh);
                cond_resched_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
        /*
         * This is a bit sleazy.  We use j_list_lock to protect transition
         * of a transaction into T_FINISHED state and calling
         * __jbd2_journal_drop_transaction(). Otherwise we could race with
         * other checkpointing code processing the transaction...
         */
        write_lock(&journal->j_state_lock);
        spin_lock(&journal->j_list_lock);
        /*
         * Now recheck if some buffers did not get attached to the transaction
         * while the lock was dropped...
         */
        if (commit_transaction->t_forget) {
                spin_unlock(&journal->j_list_lock);
                write_unlock(&journal->j_state_lock);
                goto restart_loop;
        }

        /* Done with this transaction! */

        jbd_debug(3, "JBD2: commit phase 7\n");

        J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

        commit_transaction->t_start = jiffies;
        stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
                                              commit_transaction->t_start);

        /*
         * File the transaction statistics
         */
        stats.ts_tid = commit_transaction->t_tid;
        stats.run.rs_handle_count =
                atomic_read(&commit_transaction->t_handle_count);
        trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
                             commit_transaction->t_tid, &stats.run);

        /*
         * Calculate overall stats
         */
        spin_lock(&journal->j_history_lock);
        journal->j_stats.ts_tid++;
        journal->j_stats.run.rs_wait += stats.run.rs_wait;
        journal->j_stats.run.rs_running += stats.run.rs_running;
        journal->j_stats.run.rs_locked += stats.run.rs_locked;
        journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
        journal->j_stats.run.rs_logging += stats.run.rs_logging;
        journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
        journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
        journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
        spin_unlock(&journal->j_history_lock);

        commit_transaction->t_state = T_FINISHED;
        J_ASSERT(commit_transaction == journal->j_committing_transaction);
        journal->j_commit_sequence = commit_transaction->t_tid;
        journal->j_committing_transaction = NULL;
        commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

        /*
         * weight the commit time higher than the average time so we don't
         * react too strongly to vast changes in the commit time
         */
        if (likely(journal->j_average_commit_time))
                journal->j_average_commit_time = (commit_time +
                                journal->j_average_commit_time*3) / 4;
        else
                journal->j_average_commit_time = commit_time;
        write_unlock(&journal->j_state_lock);

        if (commit_transaction->t_checkpoint_list == NULL &&
            commit_transaction->t_checkpoint_io_list == NULL) {
                __jbd2_journal_drop_transaction(journal, commit_transaction);
                to_free = 1;
        } else {
                if (journal->j_checkpoint_transactions == NULL) {
                        journal->j_checkpoint_transactions = commit_transaction;
                        commit_transaction->t_cpnext = commit_transaction;
                        commit_transaction->t_cpprev = commit_transaction;
                } else {
                        commit_transaction->t_cpnext =
                                journal->j_checkpoint_transactions;
                        commit_transaction->t_cpprev =
                                commit_transaction->t_cpnext->t_cpprev;
                        commit_transaction->t_cpnext->t_cpprev =
                                commit_transaction;
                        commit_transaction->t_cpprev->t_cpnext =
                                commit_transaction;
                }
        }
        spin_unlock(&journal->j_list_lock);

        if (journal->j_commit_callback)
                journal->j_commit_callback(journal, commit_transaction);

        trace_jbd2_end_commit(journal, commit_transaction);
        jbd_debug(1, "JBD2: commit %d complete, head %d\n",
                  journal->j_commit_sequence, journal->j_tail_sequence);
        if (to_free)
                jbd2_journal_free_transaction(commit_transaction);

        wake_up(&journal->j_wait_done_commit);
}