2 * linux/fs/jbd2/commit.c
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
16 #include <linux/time.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
33 * IO end handler for temporary buffer_heads handling writes to the journal.
35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
37 struct buffer_head *orig_bh = bh->b_private;
41 set_buffer_uptodate(bh);
43 clear_buffer_uptodate(bh);
45 clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
46 smp_mb__after_atomic();
47 wake_up_bit(&orig_bh->b_state, BH_Shadow);
53 * When an ext4 file is truncated, it is possible that some pages are not
54 * successfully freed, because they are attached to a committing transaction.
55 * After the transaction commits, these pages are left on the LRU, with no
56 * ->mapping, and with attached buffers. These pages are trivially reclaimable
57 * by the VM, but their apparent absence upsets the VM accounting, and it makes
58 * the numbers in /proc/meminfo look odd.
60 * So here, we have a buffer which has just come off the forget list. Look to
61 * see if we can strip all buffers from the backing page.
63 * Called under lock_journal(), and possibly under journal_datalist_lock. The
64 * caller provided us with a ref against the buffer, and we drop that here.
66 static void release_buffer_page(struct buffer_head *bh)
72 if (atomic_read(&bh->b_count) != 1)
80 /* OK, it's a truncated page */
81 if (!trylock_page(page))
86 try_to_free_buffers(page);
88 page_cache_release(page);
95 static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
97 struct commit_header *h;
100 if (!jbd2_journal_has_csum_v2or3(j))
103 h = (struct commit_header *)(bh->b_data);
104 h->h_chksum_type = 0;
105 h->h_chksum_size = 0;
107 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
108 h->h_chksum[0] = cpu_to_be32(csum);
112 * Done it all: now submit the commit record. We should have
113 * cleaned up our previous buffers by now, so if we are in abort
114 * mode we can now just skip the rest of the journal write
117 * Returns 1 if the journal needs to be aborted or 0 on success
119 static int journal_submit_commit_record(journal_t *journal,
120 transaction_t *commit_transaction,
121 struct buffer_head **cbh,
124 struct commit_header *tmp;
125 struct buffer_head *bh;
127 struct timespec now = current_kernel_time();
131 if (is_journal_aborted(journal))
134 bh = jbd2_journal_get_descriptor_buffer(journal);
138 tmp = (struct commit_header *)bh->b_data;
139 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
140 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
141 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
142 tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
143 tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
145 if (jbd2_has_feature_checksum(journal)) {
146 tmp->h_chksum_type = JBD2_CRC32_CHKSUM;
147 tmp->h_chksum_size = JBD2_CRC32_CHKSUM_SIZE;
148 tmp->h_chksum[0] = cpu_to_be32(crc32_sum);
150 jbd2_commit_block_csum_set(journal, bh);
152 BUFFER_TRACE(bh, "submit commit block");
154 clear_buffer_dirty(bh);
155 set_buffer_uptodate(bh);
156 bh->b_end_io = journal_end_buffer_io_sync;
158 if (journal->j_flags & JBD2_BARRIER &&
159 !jbd2_has_feature_async_commit(journal))
160 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
162 ret = submit_bh(WRITE_SYNC, bh);
169 * This function along with journal_submit_commit_record
170 * allows the commit record to be written asynchronously.
172 static int journal_wait_on_commit_record(journal_t *journal,
173 struct buffer_head *bh)
177 clear_buffer_dirty(bh);
180 if (unlikely(!buffer_uptodate(bh)))
182 put_bh(bh); /* One for getblk() */
188 * write the filemap data using writepage() address_space_operations.
189 * We don't do block allocation here even for delalloc. We don't
190 * use writepages() because with delayed allocation we may be doing
191 * block allocation in writepages().
193 static int journal_submit_inode_data_buffers(struct address_space *mapping)
196 struct writeback_control wbc = {
197 .sync_mode = WB_SYNC_ALL,
198 .nr_to_write = mapping->nrpages * 2,
200 .range_end = i_size_read(mapping->host),
203 ret = generic_writepages(mapping, &wbc);
208 * Submit all the data buffers of inode associated with the transaction to
211 * We are in a committing transaction. Therefore no new inode can be added to
212 * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
213 * operate on from being released while we write out pages.
215 static int journal_submit_data_buffers(journal_t *journal,
216 transaction_t *commit_transaction)
218 struct jbd2_inode *jinode;
220 struct address_space *mapping;
222 spin_lock(&journal->j_list_lock);
223 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
224 mapping = jinode->i_vfs_inode->i_mapping;
225 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
226 spin_unlock(&journal->j_list_lock);
228 * submit the inode data buffers. We use writepage
229 * instead of writepages, because writepages can do
230 * block allocation with delalloc. We need to write
231 * only allocated blocks here.
233 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
234 err = journal_submit_inode_data_buffers(mapping);
237 spin_lock(&journal->j_list_lock);
238 J_ASSERT(jinode->i_transaction == commit_transaction);
239 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
240 smp_mb__after_atomic();
241 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243 spin_unlock(&journal->j_list_lock);
248 * Wait for data submitted for writeout, refile inodes to proper
249 * transaction if needed.
252 static int journal_finish_inode_data_buffers(journal_t *journal,
253 transaction_t *commit_transaction)
255 struct jbd2_inode *jinode, *next_i;
258 /* For locking, see the comment in journal_submit_data_buffers() */
259 spin_lock(&journal->j_list_lock);
260 list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
261 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
262 spin_unlock(&journal->j_list_lock);
263 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
266 * Because AS_EIO is cleared by
267 * filemap_fdatawait_range(), set it again so
268 * that user process can get -EIO from fsync().
271 &jinode->i_vfs_inode->i_mapping->flags);
276 spin_lock(&journal->j_list_lock);
277 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
278 smp_mb__after_atomic();
279 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
282 /* Now refile inode to proper lists */
283 list_for_each_entry_safe(jinode, next_i,
284 &commit_transaction->t_inode_list, i_list) {
285 list_del(&jinode->i_list);
286 if (jinode->i_next_transaction) {
287 jinode->i_transaction = jinode->i_next_transaction;
288 jinode->i_next_transaction = NULL;
289 list_add(&jinode->i_list,
290 &jinode->i_transaction->t_inode_list);
292 jinode->i_transaction = NULL;
295 spin_unlock(&journal->j_list_lock);
300 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302 struct page *page = bh->b_page;
306 addr = kmap_atomic(page);
307 checksum = crc32_be(crc32_sum,
308 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
314 static void write_tag_block(journal_t *j, journal_block_tag_t *tag,
315 unsigned long long block)
317 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
318 if (jbd2_has_feature_64bit(j))
319 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
322 static void jbd2_descr_block_csum_set(journal_t *j,
323 struct buffer_head *bh)
325 struct jbd2_journal_block_tail *tail;
328 if (!jbd2_journal_has_csum_v2or3(j))
331 tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
332 sizeof(struct jbd2_journal_block_tail));
333 tail->t_checksum = 0;
334 csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
335 tail->t_checksum = cpu_to_be32(csum);
338 static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
339 struct buffer_head *bh, __u32 sequence)
341 journal_block_tag3_t *tag3 = (journal_block_tag3_t *)tag;
342 struct page *page = bh->b_page;
347 if (!jbd2_journal_has_csum_v2or3(j))
350 seq = cpu_to_be32(sequence);
351 addr = kmap_atomic(page);
352 csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
353 csum32 = jbd2_chksum(j, csum32, addr + offset_in_page(bh->b_data),
357 if (jbd2_has_feature_csum3(j))
358 tag3->t_checksum = cpu_to_be32(csum32);
360 tag->t_checksum = cpu_to_be16(csum32);
363 * jbd2_journal_commit_transaction
365 * The primary function for committing a transaction to the log. This
366 * function is called by the journal thread to begin a complete commit.
368 void jbd2_journal_commit_transaction(journal_t *journal)
370 struct transaction_stats_s stats;
371 transaction_t *commit_transaction;
372 struct journal_head *jh;
373 struct buffer_head *descriptor;
374 struct buffer_head **wbuf = journal->j_wbuf;
378 unsigned long long blocknr;
382 journal_header_t *header;
383 journal_block_tag_t *tag = NULL;
388 int tag_bytes = journal_tag_bytes(journal);
389 struct buffer_head *cbh = NULL; /* For transactional checksums */
390 __u32 crc32_sum = ~0;
391 struct blk_plug plug;
392 /* Tail of the journal */
393 unsigned long first_block;
400 if (jbd2_journal_has_csum_v2or3(journal))
401 csum_size = sizeof(struct jbd2_journal_block_tail);
404 * First job: lock down the current transaction and wait for
405 * all outstanding updates to complete.
408 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
409 if (journal->j_flags & JBD2_FLUSHED) {
410 jbd_debug(3, "super block updated\n");
411 mutex_lock(&journal->j_checkpoint_mutex);
413 * We hold j_checkpoint_mutex so tail cannot change under us.
414 * We don't need any special data guarantees for writing sb
415 * since journal is empty and it is ok for write to be
416 * flushed only with transaction commit.
418 jbd2_journal_update_sb_log_tail(journal,
419 journal->j_tail_sequence,
422 mutex_unlock(&journal->j_checkpoint_mutex);
424 jbd_debug(3, "superblock not updated\n");
427 J_ASSERT(journal->j_running_transaction != NULL);
428 J_ASSERT(journal->j_committing_transaction == NULL);
430 commit_transaction = journal->j_running_transaction;
432 trace_jbd2_start_commit(journal, commit_transaction);
433 jbd_debug(1, "JBD2: starting commit of transaction %d\n",
434 commit_transaction->t_tid);
436 write_lock(&journal->j_state_lock);
437 J_ASSERT(commit_transaction->t_state == T_RUNNING);
438 commit_transaction->t_state = T_LOCKED;
440 trace_jbd2_commit_locking(journal, commit_transaction);
441 stats.run.rs_wait = commit_transaction->t_max_wait;
442 stats.run.rs_request_delay = 0;
443 stats.run.rs_locked = jiffies;
444 if (commit_transaction->t_requested)
445 stats.run.rs_request_delay =
446 jbd2_time_diff(commit_transaction->t_requested,
447 stats.run.rs_locked);
448 stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
449 stats.run.rs_locked);
451 spin_lock(&commit_transaction->t_handle_lock);
452 while (atomic_read(&commit_transaction->t_updates)) {
455 prepare_to_wait(&journal->j_wait_updates, &wait,
456 TASK_UNINTERRUPTIBLE);
457 if (atomic_read(&commit_transaction->t_updates)) {
458 spin_unlock(&commit_transaction->t_handle_lock);
459 write_unlock(&journal->j_state_lock);
461 write_lock(&journal->j_state_lock);
462 spin_lock(&commit_transaction->t_handle_lock);
464 finish_wait(&journal->j_wait_updates, &wait);
466 spin_unlock(&commit_transaction->t_handle_lock);
468 J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
469 journal->j_max_transaction_buffers);
472 * First thing we are allowed to do is to discard any remaining
473 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
474 * that there are no such buffers: if a large filesystem
475 * operation like a truncate needs to split itself over multiple
476 * transactions, then it may try to do a jbd2_journal_restart() while
477 * there are still BJ_Reserved buffers outstanding. These must
478 * be released cleanly from the current transaction.
480 * In this case, the filesystem must still reserve write access
481 * again before modifying the buffer in the new transaction, but
482 * we do not require it to remember exactly which old buffers it
483 * has reserved. This is consistent with the existing behaviour
484 * that multiple jbd2_journal_get_write_access() calls to the same
485 * buffer are perfectly permissible.
487 while (commit_transaction->t_reserved_list) {
488 jh = commit_transaction->t_reserved_list;
489 JBUFFER_TRACE(jh, "reserved, unused: refile");
491 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
492 * leave undo-committed data.
494 if (jh->b_committed_data) {
495 struct buffer_head *bh = jh2bh(jh);
497 jbd_lock_bh_state(bh);
498 jbd2_free(jh->b_committed_data, bh->b_size);
499 jh->b_committed_data = NULL;
500 jbd_unlock_bh_state(bh);
502 jbd2_journal_refile_buffer(journal, jh);
506 * Now try to drop any written-back buffers from the journal's
507 * checkpoint lists. We do this *before* commit because it potentially
510 spin_lock(&journal->j_list_lock);
511 __jbd2_journal_clean_checkpoint_list(journal, false);
512 spin_unlock(&journal->j_list_lock);
514 jbd_debug(3, "JBD2: commit phase 1\n");
517 * Clear revoked flag to reflect there are no revoked buffers
518 * in the next transaction which is going to be started.
520 jbd2_clear_buffer_revoked_flags(journal);
523 * Switch to a new revoke table.
525 jbd2_journal_switch_revoke_table(journal);
528 * Reserved credits cannot be claimed anymore, free them
530 atomic_sub(atomic_read(&journal->j_reserved_credits),
531 &commit_transaction->t_outstanding_credits);
533 trace_jbd2_commit_flushing(journal, commit_transaction);
534 stats.run.rs_flushing = jiffies;
535 stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
536 stats.run.rs_flushing);
538 commit_transaction->t_state = T_FLUSH;
539 journal->j_committing_transaction = commit_transaction;
540 journal->j_running_transaction = NULL;
541 start_time = ktime_get();
542 commit_transaction->t_log_start = journal->j_head;
543 wake_up(&journal->j_wait_transaction_locked);
544 write_unlock(&journal->j_state_lock);
546 jbd_debug(3, "JBD2: commit phase 2a\n");
549 * Now start flushing things to disk, in the order they appear
550 * on the transaction lists. Data blocks go first.
552 err = journal_submit_data_buffers(journal, commit_transaction);
554 jbd2_journal_abort(journal, err);
556 blk_start_plug(&plug);
557 jbd2_journal_write_revoke_records(journal, commit_transaction,
558 &log_bufs, WRITE_SYNC);
560 jbd_debug(3, "JBD2: commit phase 2b\n");
563 * Way to go: we have now written out all of the data for a
564 * transaction! Now comes the tricky part: we need to write out
565 * metadata. Loop over the transaction's entire buffer list:
567 write_lock(&journal->j_state_lock);
568 commit_transaction->t_state = T_COMMIT;
569 write_unlock(&journal->j_state_lock);
571 trace_jbd2_commit_logging(journal, commit_transaction);
572 stats.run.rs_logging = jiffies;
573 stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
574 stats.run.rs_logging);
575 stats.run.rs_blocks =
576 atomic_read(&commit_transaction->t_outstanding_credits);
577 stats.run.rs_blocks_logged = 0;
579 J_ASSERT(commit_transaction->t_nr_buffers <=
580 atomic_read(&commit_transaction->t_outstanding_credits));
585 while (commit_transaction->t_buffers) {
587 /* Find the next buffer to be journaled... */
589 jh = commit_transaction->t_buffers;
591 /* If we're in abort mode, we just un-journal the buffer and
594 if (is_journal_aborted(journal)) {
595 clear_buffer_jbddirty(jh2bh(jh));
596 JBUFFER_TRACE(jh, "journal is aborting: refile");
597 jbd2_buffer_abort_trigger(jh,
599 jh->b_frozen_triggers :
601 jbd2_journal_refile_buffer(journal, jh);
602 /* If that was the last one, we need to clean up
603 * any descriptor buffers which may have been
604 * already allocated, even if we are now
606 if (!commit_transaction->t_buffers)
607 goto start_journal_io;
611 /* Make sure we have a descriptor block in which to
612 record the metadata buffer. */
615 J_ASSERT (bufs == 0);
617 jbd_debug(4, "JBD2: get descriptor\n");
619 descriptor = jbd2_journal_get_descriptor_buffer(journal);
621 jbd2_journal_abort(journal, -EIO);
625 jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
626 (unsigned long long)descriptor->b_blocknr,
628 header = (journal_header_t *)descriptor->b_data;
629 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
630 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
631 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
633 tagp = &descriptor->b_data[sizeof(journal_header_t)];
634 space_left = descriptor->b_size -
635 sizeof(journal_header_t);
637 set_buffer_jwrite(descriptor);
638 set_buffer_dirty(descriptor);
639 wbuf[bufs++] = descriptor;
641 /* Record it so that we can wait for IO
643 BUFFER_TRACE(descriptor, "ph3: file as descriptor");
644 jbd2_file_log_bh(&log_bufs, descriptor);
647 /* Where is the buffer to be written? */
649 err = jbd2_journal_next_log_block(journal, &blocknr);
650 /* If the block mapping failed, just abandon the buffer
651 and repeat this loop: we'll fall into the
652 refile-on-abort condition above. */
654 jbd2_journal_abort(journal, err);
659 * start_this_handle() uses t_outstanding_credits to determine
660 * the free space in the log, but this counter is changed
661 * by jbd2_journal_next_log_block() also.
663 atomic_dec(&commit_transaction->t_outstanding_credits);
665 /* Bump b_count to prevent truncate from stumbling over
666 the shadowed buffer! @@@ This can go if we ever get
667 rid of the shadow pairing of buffers. */
668 atomic_inc(&jh2bh(jh)->b_count);
671 * Make a temporary IO buffer with which to write it out
672 * (this will requeue the metadata buffer to BJ_Shadow).
674 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
675 JBUFFER_TRACE(jh, "ph3: write metadata");
676 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
677 jh, &wbuf[bufs], blocknr);
679 jbd2_journal_abort(journal, flags);
682 jbd2_file_log_bh(&io_bufs, wbuf[bufs]);
684 /* Record the new block's tag in the current descriptor
689 tag_flag |= JBD2_FLAG_ESCAPE;
691 tag_flag |= JBD2_FLAG_SAME_UUID;
693 tag = (journal_block_tag_t *) tagp;
694 write_tag_block(journal, tag, jh2bh(jh)->b_blocknr);
695 tag->t_flags = cpu_to_be16(tag_flag);
696 jbd2_block_tag_csum_set(journal, tag, wbuf[bufs],
697 commit_transaction->t_tid);
699 space_left -= tag_bytes;
703 memcpy (tagp, journal->j_uuid, 16);
709 /* If there's no more to do, or if the descriptor is full,
712 if (bufs == journal->j_wbufsize ||
713 commit_transaction->t_buffers == NULL ||
714 space_left < tag_bytes + 16 + csum_size) {
716 jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
718 /* Write an end-of-descriptor marker before
719 submitting the IOs. "tag" still points to
720 the last tag we set up. */
722 tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
724 jbd2_descr_block_csum_set(journal, descriptor);
726 for (i = 0; i < bufs; i++) {
727 struct buffer_head *bh = wbuf[i];
731 if (jbd2_has_feature_checksum(journal)) {
733 jbd2_checksum_data(crc32_sum, bh);
737 clear_buffer_dirty(bh);
738 set_buffer_uptodate(bh);
739 bh->b_end_io = journal_end_buffer_io_sync;
740 submit_bh(WRITE_SYNC, bh);
743 stats.run.rs_blocks_logged += bufs;
745 /* Force a new descriptor to be generated next
746 time round the loop. */
752 err = journal_finish_inode_data_buffers(journal, commit_transaction);
755 "JBD2: Detected IO errors while flushing file data "
756 "on %s\n", journal->j_devname);
757 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
758 jbd2_journal_abort(journal, err);
763 * Get current oldest transaction in the log before we issue flush
764 * to the filesystem device. After the flush we can be sure that
765 * blocks of all older transactions are checkpointed to persistent
766 * storage and we will be safe to update journal start in the
767 * superblock with the numbers we get here.
770 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
772 write_lock(&journal->j_state_lock);
774 long freed = first_block - journal->j_tail;
776 if (first_block < journal->j_tail)
777 freed += journal->j_last - journal->j_first;
778 /* Update tail only if we free significant amount of space */
779 if (freed < journal->j_maxlen / 4)
782 J_ASSERT(commit_transaction->t_state == T_COMMIT);
783 commit_transaction->t_state = T_COMMIT_DFLUSH;
784 write_unlock(&journal->j_state_lock);
787 * If the journal is not located on the file system device,
788 * then we must flush the file system device before we issue
791 if (commit_transaction->t_need_data_flush &&
792 (journal->j_fs_dev != journal->j_dev) &&
793 (journal->j_flags & JBD2_BARRIER))
794 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
796 /* Done it all: now write the commit record asynchronously. */
797 if (jbd2_has_feature_async_commit(journal)) {
798 err = journal_submit_commit_record(journal, commit_transaction,
801 __jbd2_journal_abort_hard(journal);
804 blk_finish_plug(&plug);
806 /* Lo and behold: we have just managed to send a transaction to
807 the log. Before we can commit it, wait for the IO so far to
808 complete. Control buffers being written are on the
809 transaction's t_log_list queue, and metadata buffers are on
812 Wait for the buffers in reverse order. That way we are
813 less likely to be woken up until all IOs have completed, and
814 so we incur less scheduling load.
817 jbd_debug(3, "JBD2: commit phase 3\n");
819 while (!list_empty(&io_bufs)) {
820 struct buffer_head *bh = list_entry(io_bufs.prev,
827 if (unlikely(!buffer_uptodate(bh)))
829 jbd2_unfile_log_bh(bh);
832 * The list contains temporary buffer heads created by
833 * jbd2_journal_write_metadata_buffer().
835 BUFFER_TRACE(bh, "dumping temporary bh");
837 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
838 free_buffer_head(bh);
840 /* We also have to refile the corresponding shadowed buffer */
841 jh = commit_transaction->t_shadow_list->b_tprev;
843 clear_buffer_jwrite(bh);
844 J_ASSERT_BH(bh, buffer_jbddirty(bh));
845 J_ASSERT_BH(bh, !buffer_shadow(bh));
847 /* The metadata is now released for reuse, but we need
848 to remember it against this transaction so that when
849 we finally commit, we can do any checkpointing
851 JBUFFER_TRACE(jh, "file as BJ_Forget");
852 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
853 JBUFFER_TRACE(jh, "brelse shadowed buffer");
857 J_ASSERT (commit_transaction->t_shadow_list == NULL);
859 jbd_debug(3, "JBD2: commit phase 4\n");
861 /* Here we wait for the revoke record and descriptor record buffers */
862 while (!list_empty(&log_bufs)) {
863 struct buffer_head *bh;
865 bh = list_entry(log_bufs.prev, struct buffer_head, b_assoc_buffers);
869 if (unlikely(!buffer_uptodate(bh)))
872 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
873 clear_buffer_jwrite(bh);
874 jbd2_unfile_log_bh(bh);
875 __brelse(bh); /* One for getblk */
876 /* AKPM: bforget here */
880 jbd2_journal_abort(journal, err);
882 jbd_debug(3, "JBD2: commit phase 5\n");
883 write_lock(&journal->j_state_lock);
884 J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
885 commit_transaction->t_state = T_COMMIT_JFLUSH;
886 write_unlock(&journal->j_state_lock);
888 if (!jbd2_has_feature_async_commit(journal)) {
889 err = journal_submit_commit_record(journal, commit_transaction,
892 __jbd2_journal_abort_hard(journal);
895 err = journal_wait_on_commit_record(journal, cbh);
896 if (jbd2_has_feature_async_commit(journal) &&
897 journal->j_flags & JBD2_BARRIER) {
898 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
902 jbd2_journal_abort(journal, err);
905 * Now disk caches for filesystem device are flushed so we are safe to
906 * erase checkpointed transactions from the log by updating journal
910 jbd2_update_log_tail(journal, first_tid, first_block);
912 /* End of a transaction! Finally, we can do checkpoint
913 processing: any buffers committed as a result of this
914 transaction can be removed from any checkpoint list it was on
917 jbd_debug(3, "JBD2: commit phase 6\n");
919 J_ASSERT(list_empty(&commit_transaction->t_inode_list));
920 J_ASSERT(commit_transaction->t_buffers == NULL);
921 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
922 J_ASSERT(commit_transaction->t_shadow_list == NULL);
926 * As there are other places (journal_unmap_buffer()) adding buffers
927 * to this list we have to be careful and hold the j_list_lock.
929 spin_lock(&journal->j_list_lock);
930 while (commit_transaction->t_forget) {
931 transaction_t *cp_transaction;
932 struct buffer_head *bh;
935 jh = commit_transaction->t_forget;
936 spin_unlock(&journal->j_list_lock);
939 * Get a reference so that bh cannot be freed before we are
943 jbd_lock_bh_state(bh);
944 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
947 * If there is undo-protected committed data against
948 * this buffer, then we can remove it now. If it is a
949 * buffer needing such protection, the old frozen_data
950 * field now points to a committed version of the
951 * buffer, so rotate that field to the new committed
954 * Otherwise, we can just throw away the frozen data now.
956 * We also know that the frozen data has already fired
957 * its triggers if they exist, so we can clear that too.
959 if (jh->b_committed_data) {
960 jbd2_free(jh->b_committed_data, bh->b_size);
961 jh->b_committed_data = NULL;
962 if (jh->b_frozen_data) {
963 jh->b_committed_data = jh->b_frozen_data;
964 jh->b_frozen_data = NULL;
965 jh->b_frozen_triggers = NULL;
967 } else if (jh->b_frozen_data) {
968 jbd2_free(jh->b_frozen_data, bh->b_size);
969 jh->b_frozen_data = NULL;
970 jh->b_frozen_triggers = NULL;
973 spin_lock(&journal->j_list_lock);
974 cp_transaction = jh->b_cp_transaction;
975 if (cp_transaction) {
976 JBUFFER_TRACE(jh, "remove from old cp transaction");
977 cp_transaction->t_chp_stats.cs_dropped++;
978 __jbd2_journal_remove_checkpoint(jh);
981 /* Only re-checkpoint the buffer_head if it is marked
982 * dirty. If the buffer was added to the BJ_Forget list
983 * by jbd2_journal_forget, it may no longer be dirty and
984 * there's no point in keeping a checkpoint record for
988 * A buffer which has been freed while still being journaled by
989 * a previous transaction.
991 if (buffer_freed(bh)) {
993 * If the running transaction is the one containing
994 * "add to orphan" operation (b_next_transaction !=
995 * NULL), we have to wait for that transaction to
996 * commit before we can really get rid of the buffer.
997 * So just clear b_modified to not confuse transaction
998 * credit accounting and refile the buffer to
999 * BJ_Forget of the running transaction. If the just
1000 * committed transaction contains "add to orphan"
1001 * operation, we can completely invalidate the buffer
1002 * now. We are rather thorough in that since the
1003 * buffer may be still accessible when blocksize <
1004 * pagesize and it is attached to the last partial
1008 if (!jh->b_next_transaction) {
1009 clear_buffer_freed(bh);
1010 clear_buffer_jbddirty(bh);
1011 clear_buffer_mapped(bh);
1012 clear_buffer_new(bh);
1013 clear_buffer_req(bh);
1018 if (buffer_jbddirty(bh)) {
1019 JBUFFER_TRACE(jh, "add to new checkpointing trans");
1020 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1021 if (is_journal_aborted(journal))
1022 clear_buffer_jbddirty(bh);
1024 J_ASSERT_BH(bh, !buffer_dirty(bh));
1026 * The buffer on BJ_Forget list and not jbddirty means
1027 * it has been freed by this transaction and hence it
1028 * could not have been reallocated until this
1029 * transaction has committed. *BUT* it could be
1030 * reallocated once we have written all the data to
1031 * disk and before we process the buffer on BJ_Forget
1034 if (!jh->b_next_transaction)
1037 JBUFFER_TRACE(jh, "refile or unfile buffer");
1038 __jbd2_journal_refile_buffer(jh);
1039 jbd_unlock_bh_state(bh);
1041 release_buffer_page(bh); /* Drops bh reference */
1044 cond_resched_lock(&journal->j_list_lock);
1046 spin_unlock(&journal->j_list_lock);
1048 * This is a bit sleazy. We use j_list_lock to protect transition
1049 * of a transaction into T_FINISHED state and calling
1050 * __jbd2_journal_drop_transaction(). Otherwise we could race with
1051 * other checkpointing code processing the transaction...
1053 write_lock(&journal->j_state_lock);
1054 spin_lock(&journal->j_list_lock);
1056 * Now recheck if some buffers did not get attached to the transaction
1057 * while the lock was dropped...
1059 if (commit_transaction->t_forget) {
1060 spin_unlock(&journal->j_list_lock);
1061 write_unlock(&journal->j_state_lock);
1065 /* Add the transaction to the checkpoint list
1066 * __journal_remove_checkpoint() can not destroy transaction
1067 * under us because it is not marked as T_FINISHED yet */
1068 if (journal->j_checkpoint_transactions == NULL) {
1069 journal->j_checkpoint_transactions = commit_transaction;
1070 commit_transaction->t_cpnext = commit_transaction;
1071 commit_transaction->t_cpprev = commit_transaction;
1073 commit_transaction->t_cpnext =
1074 journal->j_checkpoint_transactions;
1075 commit_transaction->t_cpprev =
1076 commit_transaction->t_cpnext->t_cpprev;
1077 commit_transaction->t_cpnext->t_cpprev =
1079 commit_transaction->t_cpprev->t_cpnext =
1082 spin_unlock(&journal->j_list_lock);
1084 /* Done with this transaction! */
1086 jbd_debug(3, "JBD2: commit phase 7\n");
1088 J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1090 commit_transaction->t_start = jiffies;
1091 stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1092 commit_transaction->t_start);
1095 * File the transaction statistics
1097 stats.ts_tid = commit_transaction->t_tid;
1098 stats.run.rs_handle_count =
1099 atomic_read(&commit_transaction->t_handle_count);
1100 trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1101 commit_transaction->t_tid, &stats.run);
1102 stats.ts_requested = (commit_transaction->t_requested) ? 1 : 0;
1104 commit_transaction->t_state = T_COMMIT_CALLBACK;
1105 J_ASSERT(commit_transaction == journal->j_committing_transaction);
1106 journal->j_commit_sequence = commit_transaction->t_tid;
1107 journal->j_committing_transaction = NULL;
1108 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1111 * weight the average time higher than the commit time so we don't
1112 * react too strongly to vast changes in the commit time
1114 if (likely(journal->j_average_commit_time))
1115 journal->j_average_commit_time = (commit_time +
1116 journal->j_average_commit_time*3) / 4;
1118 journal->j_average_commit_time = commit_time;
1120 write_unlock(&journal->j_state_lock);
1122 if (journal->j_commit_callback)
1123 journal->j_commit_callback(journal, commit_transaction);
1125 trace_jbd2_end_commit(journal, commit_transaction);
1126 jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1127 journal->j_commit_sequence, journal->j_tail_sequence);
1129 write_lock(&journal->j_state_lock);
1130 spin_lock(&journal->j_list_lock);
1131 commit_transaction->t_state = T_FINISHED;
1132 /* Check if the transaction can be dropped now that we are finished */
1133 if (commit_transaction->t_checkpoint_list == NULL &&
1134 commit_transaction->t_checkpoint_io_list == NULL) {
1135 __jbd2_journal_drop_transaction(journal, commit_transaction);
1136 jbd2_journal_free_transaction(commit_transaction);
1138 spin_unlock(&journal->j_list_lock);
1139 write_unlock(&journal->j_state_lock);
1140 wake_up(&journal->j_wait_done_commit);
1143 * Calculate overall stats
1145 spin_lock(&journal->j_history_lock);
1146 journal->j_stats.ts_tid++;
1147 journal->j_stats.ts_requested += stats.ts_requested;
1148 journal->j_stats.run.rs_wait += stats.run.rs_wait;
1149 journal->j_stats.run.rs_request_delay += stats.run.rs_request_delay;
1150 journal->j_stats.run.rs_running += stats.run.rs_running;
1151 journal->j_stats.run.rs_locked += stats.run.rs_locked;
1152 journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1153 journal->j_stats.run.rs_logging += stats.run.rs_logging;
1154 journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1155 journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1156 journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1157 spin_unlock(&journal->j_history_lock);