/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (TestSetPageLocked(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	int barrier_done = 0;
	struct timespec now = current_kernel_time();

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
	}

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	get_bh(bh);
	set_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = submit_bh(WRITE, bh);
	if (barrier_done)
		clear_buffer_ordered(bh);

	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
		       "JBD: barrier-based sync failed on %s - "
		       "disabling barriers\n",
		       bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JBD2_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		lock_buffer(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = submit_bh(WRITE, bh);
	}
	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);			/* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Submit all the data buffers of the inodes associated with the
 * transaction to disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * currently operate on from being released while we write out pages.
 */
static int journal_submit_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawrite_range(mapping, 0,
					i_size_read(jinode->i_vfs_inode));
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_inode_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		jinode->i_flags |= JI_COMMIT_RUNNING;
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		jinode->i_flags &= ~JI_COMMIT_RUNNING;
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inode to proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

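/*
 * Fold a buffer's contents into the running commit-block checksum.  The
 * buffer's page is mapped with kmap_atomic() since it may live in highmem,
 * and crc32_be() is run over b_size bytes starting at the buffer's offset
 * within that page.
 */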
static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page, KM_USER0);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr, KM_USER0);

	return checksum;
}

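/*
 * Record a block number in an on-disk journal tag.  The low 32 bits always
 * go in t_blocknr; when the journal uses tags larger than JBD2_TAG_SIZE32,
 * the high bits go in t_blocknr_high.
 */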
static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		jbd2_journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	stats.u.run.rs_wait = commit_transaction->t_max_wait;
	stats.u.run.rs_locked = jiffies;
	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
						stats.u.run.rs_locked);

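	/*
	 * Wait for any currently running handles (t_updates) to finish.
	 * New handles cannot join this transaction once it is T_LOCKED.
	 */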
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);

	stats.u.run.rs_flushing = jiffies;
	stats.u.run.rs_locked = jbd2_time_diff(stats.u.run.rs_locked,
					       stats.u.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_inode_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	jbd2_journal_write_revoke_records(journal, commit_transaction);

	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	stats.u.run.rs_logging = jiffies;
	stats.u.run.rs_flushing = jbd2_time_diff(stats.u.run.rs_flushing,
						 stats.u.run.rs_logging);
	stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits;
	stats.u.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	err = 0;
	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO*/

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();
			stats.u.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Done it all: now write the commit record asynchronously. */

	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}

	/*
	 * This is the right place to wait for data buffers both for ASYNC
	 * and !ASYNC commit. If commit is ASYNC, we need to wait only after
	 * the commit block went to disk (which happens above). If commit is
	 * SYNC, we need to wait for data buffers before we start writing
	 * commit block, which happens below in such setting.
	 */
	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 5\n");

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
		JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (!err && !is_journal_aborted(journal))
		err = journal_wait_on_commit_record(cbh);

	if (err)
		jbd2_journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list they were on
	   before. */

	jbd_debug(3, "JBD: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__jbd2_journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__jbd2_journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				jbd2_journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_start = jiffies;
	stats.u.run.rs_logging = jbd2_time_diff(stats.u.run.rs_logging,
						commit_transaction->t_start);

	/*
	 * File the transaction for history
	 */
	stats.ts_type = JBD2_STATS_RUN;
	stats.ts_tid = commit_transaction->t_tid;
	stats.u.run.rs_handle_count = commit_transaction->t_handle_count;
	spin_lock(&journal->j_history_lock);
	memcpy(journal->j_history + journal->j_history_cur, &stats,
			sizeof(stats));
	if (++journal->j_history_cur == journal->j_history_max)
		journal->j_history_cur = 0;

	/*
	 * Calculate overall stats
	 */
	journal->j_stats.ts_tid++;
	journal->j_stats.u.run.rs_wait += stats.u.run.rs_wait;
	journal->j_stats.u.run.rs_running += stats.u.run.rs_running;
	journal->j_stats.u.run.rs_locked += stats.u.run.rs_locked;
	journal->j_stats.u.run.rs_flushing += stats.u.run.rs_flushing;
	journal->j_stats.u.run.rs_logging += stats.u.run.rs_logging;
	journal->j_stats.u.run.rs_handle_count += stats.u.run.rs_handle_count;
	journal->j_stats.u.run.rs_blocks += stats.u.run.rs_blocks;
	journal->j_stats.u.run.rs_blocks_logged += stats.u.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

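	/*
	 * If nothing remains to be checkpointed, drop the transaction right
	 * away; otherwise link it into the journal's circular list of
	 * checkpoint transactions so it is cleaned up later.
	 */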
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}