fs/jbd2/commit.c

   1 /*
   2  * linux/fs/jbd2/commit.c
   3  *
   4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
   5  *
   6  * Copyright 1998 Red Hat corp --- All Rights Reserved
   7  *
   8  * This file is part of the Linux kernel and is made available under
   9  * the terms of the GNU General Public License, version 2, or at your
  10  * option, any later version, incorporated herein by reference.
  11  *
  12  * Journal commit routines for the generic filesystem journaling code;
  13  * part of the ext2fs journaling system.
  14  */
  15
  16 #include <linux/time.h>
  17 #include <linux/fs.h>
  18 #include <linux/jbd2.h>
  19 #include <linux/errno.h>
  20 #include <linux/slab.h>
  21 #include <linux/mm.h>
  22 #include <linux/pagemap.h>
  23 #include <linux/jiffies.h>
  24 #include <linux/crc32.h>
  25 #include <linux/writeback.h>
  26 #include <linux/backing-dev.h>
  27 #include <linux/bio.h>
  28 #include <linux/blkdev.h>
  29 #include <linux/bitops.h>
  30 #include <trace/events/jbd2.h>
  31
  32 /*
  33  * Default IO end handler for temporary BJ_IO buffer_heads.
  34  */
  35 static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
  36 {
  37         BUFFER_TRACE(bh, "");
  38         if (uptodate)
  39                 set_buffer_uptodate(bh);
  40         else
  41                 clear_buffer_uptodate(bh);
  42         unlock_buffer(bh);
  43 }
  44
  45 /*
  46  * When an ext4 file is truncated, it is possible that some pages are not
  47  * successfully freed, because they are attached to a committing transaction.
  48  * After the transaction commits, these pages are left on the LRU, with no
  49  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
  50  * by the VM, but their apparent absence upsets the VM accounting, and it makes
  51  * the numbers in /proc/meminfo look odd.
  52  *
  53  * So here, we have a buffer which has just come off the forget list.  Look to
  54  * see if we can strip all buffers from the backing page.
  55  *
  56  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
  57  * caller provided us with a ref against the buffer, and we drop that here.
  58  */
  59 static void release_buffer_page(struct buffer_head *bh)
  60 {
  61         struct page *page;
  62
  63         if (buffer_dirty(bh))
  64                 goto nope;
  65         if (atomic_read(&bh->b_count) != 1)
  66                 goto nope;
  67         page = bh->b_page;
  68         if (!page)
  69                 goto nope;
  70         if (page->mapping)
  71                 goto nope;
  72
  73         /* OK, it's a truncated page */
  74         if (!trylock_page(page))
  75                 goto nope;
  76
  77         page_cache_get(page);
  78         __brelse(bh);
  79         try_to_free_buffers(page);
  80         unlock_page(page);
  81         page_cache_release(page);
  82         return;
  83
  84 nope:
  85         __brelse(bh);
  86 }
  87
  88 static void jbd2_commit_block_csum_set(journal_t *j,
  89                                        struct journal_head *descriptor)
  90 {
  91         struct commit_header *h;
  92         __u32 csum;
  93
  94         if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
  95                 return;
  96
  97         h = (struct commit_header *)(jh2bh(descriptor)->b_data);
  98         h->h_chksum_type = 0;
  99         h->h_chksum_size = 0;
 100         h->h_chksum[0] = 0;
 101         csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
 102                            j->j_blocksize);
 103         h->h_chksum[0] = cpu_to_be32(csum);
 104 }
 105
 106 /*
 107  * Done it all: now submit the commit record.  We should have
 108  * cleaned up our previous buffers by now, so if we are in abort
 109  * mode we can now just skip the rest of the journal write
 110  * entirely.
 111  *
 112  * Returns 1 if the journal needs to be aborted or 0 on success
 113  */
 114 static int journal_submit_commit_record(journal_t *journal,
 115                                         transaction_t *commit_transaction,
 116                                         struct buffer_head **cbh,
 117                                         __u32 crc32_sum)
 118 {
 119         struct journal_head *descriptor;
 120         struct commit_header *tmp;
 121         struct buffer_head *bh;
 122         int ret;
 123         struct timespec now = current_kernel_time();
 124
 125         *cbh = NULL;
 126
 127         if (is_journal_aborted(journal))
 128                 return 0;
 129
 130         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 131         if (!descriptor)
 132                 return 1;
 133
 134         bh = jh2bh(descriptor);
 135
 136         tmp = (struct commit_header *)bh->b_data;
 137         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
 138         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
 139         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
 140         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
 141         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
 142
 143         if (JBD2_HAS_COMPAT_FEATURE(journal,
 144                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
 145                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
 146                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
 147                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
 148         }
 149         jbd2_commit_block_csum_set(journal, descriptor);
 150
 151         JBUFFER_TRACE(descriptor, "submit commit block");
 152         lock_buffer(bh);
 153         clear_buffer_dirty(bh);
 154         set_buffer_uptodate(bh);
 155         bh->b_end_io = journal_end_buffer_io_sync;
 156
 157         if (journal->j_flags & JBD2_BARRIER &&
 158             !JBD2_HAS_INCOMPAT_FEATURE(journal,
 159                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
 160                 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
 161         else
 162                 ret = submit_bh(WRITE_SYNC, bh);
 163
 164         *cbh = bh;
 165         return ret;
 166 }
 167
 168 /*
 169  * This function along with journal_submit_commit_record
 170  * allows to write the commit record asynchronously.
 171  */
 172 static int journal_wait_on_commit_record(journal_t *journal,
 173                                          struct buffer_head *bh)
 174 {
 175         int ret = 0;
 176
 177         clear_buffer_dirty(bh);
 178         wait_on_buffer(bh);
 179
 180         if (unlikely(!buffer_uptodate(bh)))
 181                 ret = -EIO;
 182         put_bh(bh);            /* One for getblk() */
 183         jbd2_journal_put_journal_head(bh2jh(bh));
 184
 185         return ret;
 186 }
 187
 188 /*
 189  * write the filemap data using writepage() address_space_operations.
 190  * We don't do block allocation here even for delalloc. We don't
 191  * use writepages() because with dealyed allocation we may be doing
 192  * block allocation in writepages().
 193  */
 194 static int journal_submit_inode_data_buffers(struct address_space *mapping)
 195 {
 196         int ret;
 197         struct writeback_control wbc = {
 198                 .sync_mode =  WB_SYNC_ALL,
 199                 .nr_to_write = mapping->nrpages * 2,
 200                 .range_start = 0,
 201                 .range_end = i_size_read(mapping->host),
 202         };
 203
 204         ret = generic_writepages(mapping, &wbc);
 205         return ret;
 206 }
 207
 208 /*
 209  * Submit all the data buffers of inode associated with the transaction to
 210  * disk.
 211  *
 212  * We are in a committing transaction. Therefore no new inode can be added to
 213  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
 214  * operate on from being released while we write out pages.
 215  */
 216 static int journal_submit_data_buffers(journal_t *journal,
 217                 transaction_t *commit_transaction)
 218 {
 219         struct jbd2_inode *jinode;
 220         int err, ret = 0;
 221         struct address_space *mapping;
 222
 223         spin_lock(&journal->j_list_lock);
 224         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 225                 mapping = jinode->i_vfs_inode->i_mapping;
 226                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 227                 spin_unlock(&journal->j_list_lock);
 228                 /*
 229                  * submit the inode data buffers. We use writepage
 230                  * instead of writepages. Because writepages can do
 231                  * block allocation  with delalloc. We need to write
 232                  * only allocated blocks here.
 233                  */
 234                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
 235                 err = journal_submit_inode_data_buffers(mapping);
 236                 if (!ret)
 237                         ret = err;
 238                 spin_lock(&journal->j_list_lock);
 239                 J_ASSERT(jinode->i_transaction == commit_transaction);
 240                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 241                 smp_mb__after_clear_bit();
 242                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 243         }
 244         spin_unlock(&journal->j_list_lock);
 245         return ret;
 246 }
 247
 248 /*
 249  * Wait for data submitted for writeout, refile inodes to proper
 250  * transaction if needed.
 251  *
 252  */
 253 static int journal_finish_inode_data_buffers(journal_t *journal,
 254                 transaction_t *commit_transaction)
 255 {
 256         struct jbd2_inode *jinode, *next_i;
 257         int err, ret = 0;
 258
 259         /* For locking, see the comment in journal_submit_data_buffers() */
 260         spin_lock(&journal->j_list_lock);
 261         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
 262                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 263                 spin_unlock(&journal->j_list_lock);
 264                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
 265                 if (err) {
 266                         /*
 267                          * Because AS_EIO is cleared by
 268                          * filemap_fdatawait_range(), set it again so
 269                          * that user process can get -EIO from fsync().
 270                          */
 271                         set_bit(AS_EIO,
 272                                 &jinode->i_vfs_inode->i_mapping->flags);
 273
 274                         if (!ret)
 275                                 ret = err;
 276                 }
 277                 spin_lock(&journal->j_list_lock);
 278                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
 279                 smp_mb__after_clear_bit();
 280                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 281         }
 282
 283         /* Now refile inode to proper lists */
 284         list_for_each_entry_safe(jinode, next_i,
 285                                  &commit_transaction->t_inode_list, i_list) {
 286                 list_del(&jinode->i_list);
 287                 if (jinode->i_next_transaction) {
 288                         jinode->i_transaction = jinode->i_next_transaction;
 289                         jinode->i_next_transaction = NULL;
 290                         list_add(&jinode->i_list,
 291                                 &jinode->i_transaction->t_inode_list);
 292                 } else {
 293                         jinode->i_transaction = NULL;
 294                 }
 295         }
 296         spin_unlock(&journal->j_list_lock);
 297
 298         return ret;
 299 }
 300
 301 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
 302 {
 303         struct page *page = bh->b_page;
 304         char *addr;
 305         __u32 checksum;
 306
 307         addr = kmap_atomic(page);
 308         checksum = crc32_be(crc32_sum,
 309                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
 310         kunmap_atomic(addr);
 311
 312         return checksum;
 313 }
 314
 315 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
 316                                    unsigned long long block)
 317 {
 318         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
 319         if (tag_bytes > JBD2_TAG_SIZE32)
 320                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 321 }
 322
 323 static void jbd2_descr_block_csum_set(journal_t *j,
 324                                       struct journal_head *descriptor)
 325 {
 326         struct jbd2_journal_block_tail *tail;
 327         __u32 csum;
 328
 329         if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 330                 return;
 331
 332         tail = (struct jbd2_journal_block_tail *)
 333                         (jh2bh(descriptor)->b_data + j->j_blocksize -
 334                         sizeof(struct jbd2_journal_block_tail));
 335         tail->t_checksum = 0;
 336         csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
 337                            j->j_blocksize);
 338         tail->t_checksum = cpu_to_be32(csum);
 339 }
 340
 341 /*
 342  * jbd2_journal_commit_transaction
 343  *
 344  * The primary function for committing a transaction to the log.  This
 345  * function is called by the journal thread to begin a complete commit.
 346  */
 347 void jbd2_journal_commit_transaction(journal_t *journal)
 348 {
 349         struct transaction_stats_s stats;
 350         transaction_t *commit_transaction;
 351         struct journal_head *jh, *new_jh, *descriptor;
 352         struct buffer_head **wbuf = journal->j_wbuf;
 353         int bufs;
 354         int flags;
 355         int err;
 356         unsigned long long blocknr;
 357         ktime_t start_time;
 358         u64 commit_time;
 359         char *tagp = NULL;
 360         journal_header_t *header;
 361         journal_block_tag_t *tag = NULL;
 362         int space_left = 0;
 363         int first_tag = 0;
 364         int tag_flag;
 365         int i, to_free = 0;
 366         int tag_bytes = journal_tag_bytes(journal);
 367         struct buffer_head *cbh = NULL; /* For transactional checksums */
 368         __u32 crc32_sum = ~0;
 369         struct blk_plug plug;
 370         /* Tail of the journal */
 371         unsigned long first_block;
 372         tid_t first_tid;
 373         int update_tail;
 374         int csum_size = 0;
 375
 376         if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
 377                 csum_size = sizeof(struct jbd2_journal_block_tail);
 378
 379         /*
 380          * First job: lock down the current transaction and wait for
 381          * all outstanding updates to complete.
 382          */
 383
 384         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
 385         if (journal->j_flags & JBD2_FLUSHED) {
 386                 jbd_debug(3, "super block updated\n");
 387                 mutex_lock(&journal->j_checkpoint_mutex);
 388                 /*
 389                  * We hold j_checkpoint_mutex so tail cannot change under us.
 390                  * We don't need any special data guarantees for writing sb
 391                  * since journal is empty and it is ok for write to be
 392                  * flushed only with transaction commit.
 393                  */
 394                 jbd2_journal_update_sb_log_tail(journal,
 395                                                 journal->j_tail_sequence,
 396                                                 journal->j_tail,
 397                                                 WRITE_SYNC);
 398                 mutex_unlock(&journal->j_checkpoint_mutex);
 399         } else {
 400                 jbd_debug(3, "superblock not updated\n");
 401         }
 402
 403         J_ASSERT(journal->j_running_transaction != NULL);
 404         J_ASSERT(journal->j_committing_transaction == NULL);
 405
 406         commit_transaction = journal->j_running_transaction;
 407         J_ASSERT(commit_transaction->t_state == T_RUNNING);
 408
 409         trace_jbd2_start_commit(journal, commit_transaction);
 410         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
 411                         commit_transaction->t_tid);
 412
 413         write_lock(&journal->j_state_lock);
 414         commit_transaction->t_state = T_LOCKED;
 415
 416         trace_jbd2_commit_locking(journal, commit_transaction);
 417         stats.run.rs_wait = commit_transaction->t_max_wait;
 418         stats.run.rs_locked = jiffies;
 419         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
 420                                               stats.run.rs_locked);
 421
 422         spin_lock(&commit_transaction->t_handle_lock);
 423         while (atomic_read(&commit_transaction->t_updates)) {
 424                 DEFINE_WAIT(wait);
 425
 426                 prepare_to_wait(&journal->j_wait_updates, &wait,
 427                                         TASK_UNINTERRUPTIBLE);
 428                 if (atomic_read(&commit_transaction->t_updates)) {
 429                         spin_unlock(&commit_transaction->t_handle_lock);
 430                         write_unlock(&journal->j_state_lock);
 431                         schedule();
 432                         write_lock(&journal->j_state_lock);
 433                         spin_lock(&commit_transaction->t_handle_lock);
 434                 }
 435                 finish_wait(&journal->j_wait_updates, &wait);
 436         }
 437         spin_unlock(&commit_transaction->t_handle_lock);
 438
 439         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
 440                         journal->j_max_transaction_buffers);
 441
 442         /*
 443          * First thing we are allowed to do is to discard any remaining
 444          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
 445          * that there are no such buffers: if a large filesystem
 446          * operation like a truncate needs to split itself over multiple
 447          * transactions, then it may try to do a jbd2_journal_restart() while
 448          * there are still BJ_Reserved buffers outstanding.  These must
 449          * be released cleanly from the current transaction.
 450          *
 451          * In this case, the filesystem must still reserve write access
 452          * again before modifying the buffer in the new transaction, but
 453          * we do not require it to remember exactly which old buffers it
 454          * has reserved.  This is consistent with the existing behaviour
 455          * that multiple jbd2_journal_get_write_access() calls to the same
 456          * buffer are perfectly permissible.
 457          */
 458         while (commit_transaction->t_reserved_list) {
 459                 jh = commit_transaction->t_reserved_list;
 460                 JBUFFER_TRACE(jh, "reserved, unused: refile");
 461                 /*
 462                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
 463                  * leave undo-committed data.
 464                  */
 465                 if (jh->b_committed_data) {
 466                         struct buffer_head *bh = jh2bh(jh);
 467
 468                         jbd_lock_bh_state(bh);
 469                         jbd2_free(jh->b_committed_data, bh->b_size);
 470                         jh->b_committed_data = NULL;
 471                         jbd_unlock_bh_state(bh);
 472                 }
 473                 jbd2_journal_refile_buffer(journal, jh);
 474         }
 475
 476         /*
 477          * Now try to drop any written-back buffers from the journal's
 478          * checkpoint lists.  We do this *before* commit because it potentially
 479          * frees some memory
 480          */
 481         spin_lock(&journal->j_list_lock);
 482         __jbd2_journal_clean_checkpoint_list(journal);
 483         spin_unlock(&journal->j_list_lock);
 484
 485         jbd_debug(3, "JBD2: commit phase 1\n");
 486
 487         /*
 488          * Clear revoked flag to reflect there is no revoked buffers
 489          * in the next transaction which is going to be started.
 490          */
 491         jbd2_clear_buffer_revoked_flags(journal);
 492
 493         /*
 494          * Switch to a new revoke table.
 495          */
 496         jbd2_journal_switch_revoke_table(journal);
 497
 498         trace_jbd2_commit_flushing(journal, commit_transaction);
 499         stats.run.rs_flushing = jiffies;
 500         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
 501                                              stats.run.rs_flushing);
 502
 503         commit_transaction->t_state = T_FLUSH;
 504         journal->j_committing_transaction = commit_transaction;
 505         journal->j_running_transaction = NULL;
 506         start_time = ktime_get();
 507         commit_transaction->t_log_start = journal->j_head;
 508         wake_up(&journal->j_wait_transaction_locked);
 509         write_unlock(&journal->j_state_lock);
 510
 511         jbd_debug(3, "JBD2: commit phase 2\n");
 512
 513         /*
 514          * Now start flushing things to disk, in the order they appear
 515          * on the transaction lists.  Data blocks go first.
 516          */
 517         err = journal_submit_data_buffers(journal, commit_transaction);
 518         if (err)
 519                 jbd2_journal_abort(journal, err);
 520
 521         blk_start_plug(&plug);
 522         jbd2_journal_write_revoke_records(journal, commit_transaction,
 523                                           WRITE_SYNC);
 524         blk_finish_plug(&plug);
 525
 526         jbd_debug(3, "JBD2: commit phase 2\n");
 527
 528         /*
 529          * Way to go: we have now written out all of the data for a
 530          * transaction!  Now comes the tricky part: we need to write out
 531          * metadata.  Loop over the transaction's entire buffer list:
 532          */
 533         write_lock(&journal->j_state_lock);
 534         commit_transaction->t_state = T_COMMIT;
 535         write_unlock(&journal->j_state_lock);
 536
 537         trace_jbd2_commit_logging(journal, commit_transaction);
 538         stats.run.rs_logging = jiffies;
 539         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
 540                                                stats.run.rs_logging);
 541         stats.run.rs_blocks =
 542                 atomic_read(&commit_transaction->t_outstanding_credits);
 543         stats.run.rs_blocks_logged = 0;
 544
 545         J_ASSERT(commit_transaction->t_nr_buffers <=
 546                  atomic_read(&commit_transaction->t_outstanding_credits));
 547
 548         err = 0;
 549         descriptor = NULL;
 550         bufs = 0;
 551         blk_start_plug(&plug);
 552         while (commit_transaction->t_buffers) {
 553
 554                 /* Find the next buffer to be journaled... */
 555
 556                 jh = commit_transaction->t_buffers;
 557
 558                 /* If we're in abort mode, we just un-journal the buffer and
 559                    release it. */
 560
 561                 if (is_journal_aborted(journal)) {
 562                         clear_buffer_jbddirty(jh2bh(jh));
 563                         JBUFFER_TRACE(jh, "journal is aborting: refile");
 564                         jbd2_buffer_abort_trigger(jh,
 565                                                   jh->b_frozen_data ?
 566                                                   jh->b_frozen_triggers :
 567                                                   jh->b_triggers);
 568                         jbd2_journal_refile_buffer(journal, jh);
 569                         /* If that was the last one, we need to clean up
 570                          * any descriptor buffers which may have been
 571                          * already allocated, even if we are now
 572                          * aborting. */
 573                         if (!commit_transaction->t_buffers)
 574                                 goto start_journal_io;
 575                         continue;
 576                 }
 577
 578                 /* Make sure we have a descriptor block in which to
 579                    record the metadata buffer. */
 580
 581                 if (!descriptor) {
 582                         struct buffer_head *bh;
 583
 584                         J_ASSERT (bufs == 0);
 585
 586                         jbd_debug(4, "JBD2: get descriptor\n");
 587
 588                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
 589                         if (!descriptor) {
 590                                 jbd2_journal_abort(journal, -EIO);
 591                                 continue;
 592                         }
 593
 594                         bh = jh2bh(descriptor);
 595                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
 596                                 (unsigned long long)bh->b_blocknr, bh->b_data);
 597                         header = (journal_header_t *)&bh->b_data[0];
 598                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
 599                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
 600                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
 601
 602                         tagp = &bh->b_data[sizeof(journal_header_t)];
 603                         space_left = bh->b_size - sizeof(journal_header_t);
 604                         first_tag = 1;
 605                         set_buffer_jwrite(bh);
 606                         set_buffer_dirty(bh);
 607                         wbuf[bufs++] = bh;
 608
 609                         /* Record it so that we can wait for IO
 610                            completion later */
 611                         BUFFER_TRACE(bh, "ph3: file as descriptor");
 612                         jbd2_journal_file_buffer(descriptor, commit_transaction,
 613                                         BJ_LogCtl);
 614                 }
 615
 616                 /* Where is the buffer to be written? */
 617
 618                 err = jbd2_journal_next_log_block(journal, &blocknr);
 619                 /* If the block mapping failed, just abandon the buffer
 620                    and repeat this loop: we'll fall into the
 621                    refile-on-abort condition above. */
 622                 if (err) {
 623                         jbd2_journal_abort(journal, err);
 624                         continue;
 625                 }
 626
 627                 /*
 628                  * start_this_handle() uses t_outstanding_credits to determine
 629                  * the free space in the log, but this counter is changed
 630                  * by jbd2_journal_next_log_block() also.
 631                  */
 632                 atomic_dec(&commit_transaction->t_outstanding_credits);
 633
 634                 /* Bump b_count to prevent truncate from stumbling over
 635                    the shadowed buffer!  @@@ This can go if we ever get
 636                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
 637                 atomic_inc(&jh2bh(jh)->b_count);
 638
 639                 /* Make a temporary IO buffer with which to write it out
 640                    (this will requeue both the metadata buffer and the
 641                    temporary IO buffer). new_bh goes on BJ_IO*/
 642
 643                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
 644                 /*
 645                  * akpm: jbd2_journal_write_metadata_buffer() sets
 646                  * new_bh->b_transaction to commit_transaction.
 647                  * We need to clean this up before we release new_bh
 648                  * (which is of type BJ_IO)
 649                  */
 650                 JBUFFER_TRACE(jh, "ph3: write metadata");
 651                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
 652                                                       jh, &new_jh, blocknr);
 653                 if (flags < 0) {
 654                         jbd2_journal_abort(journal, flags);
 655                         continue;
 656                 }
 657                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
 658                 wbuf[bufs++] = jh2bh(new_jh);
 659
 660                 /* Record the new block's tag in the current descriptor
 661                    buffer */
 662
 663                 tag_flag = 0;
 664                 if (flags & 1)
 665                         tag_flag |= JBD2_FLAG_ESCAPE;
 666                 if (!first_tag)
 667                         tag_flag |= JBD2_FLAG_SAME_UUID;
 668
 669                 tag = (journal_block_tag_t *) tagp;
 670                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
 671                 tag->t_flags = cpu_to_be16(tag_flag);
 672                 tagp += tag_bytes;
 673                 space_left -= tag_bytes;
 674
 675                 if (first_tag) {
 676                         memcpy (tagp, journal->j_uuid, 16);
 677                         tagp += 16;
 678                         space_left -= 16;
 679                         first_tag = 0;
 680                 }
 681
 682                 /* If there's no more to do, or if the descriptor is full,
 683                    let the IO rip! */
 684
 685                 if (bufs == journal->j_wbufsize ||
 686                     commit_transaction->t_buffers == NULL ||
 687                     space_left < tag_bytes + 16 + csum_size) {
 688
 689                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 690
 691                         /* Write an end-of-descriptor marker before
 692                            submitting the IOs.  "tag" still points to
 693                            the last tag we set up. */
 694
 695                         tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
 696
 697                         jbd2_descr_block_csum_set(journal, descriptor);
 698 start_journal_io:
 699                         for (i = 0; i < bufs; i++) {
 700                                 struct buffer_head *bh = wbuf[i];
 701                                 /*
 702                                  * Compute checksum.
 703                                  */
 704                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
 705                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
 706                                         crc32_sum =
 707                                             jbd2_checksum_data(crc32_sum, bh);
 708                                 }
 709
 710                                 lock_buffer(bh);
 711                                 clear_buffer_dirty(bh);
 712                                 set_buffer_uptodate(bh);
 713                                 bh->b_end_io = journal_end_buffer_io_sync;
 714                                 submit_bh(WRITE_SYNC, bh);
 715                         }
 716                         cond_resched();
 717                         stats.run.rs_blocks_logged += bufs;
 718
 719                         /* Force a new descriptor to be generated next
 720                            time round the loop. */
 721                         descriptor = NULL;
 722                         bufs = 0;
 723                 }
 724         }
 725
 726         err = journal_finish_inode_data_buffers(journal, commit_transaction);
 727         if (err) {
 728                 printk(KERN_WARNING
 729                         "JBD2: Detected IO errors while flushing file data "
 730                        "on %s\n", journal->j_devname);
 731                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
 732                         jbd2_journal_abort(journal, err);
 733                 err = 0;
 734         }
 735
 736         /*
 737          * Get current oldest transaction in the log before we issue flush
 738          * to the filesystem device. After the flush we can be sure that
 739          * blocks of all older transactions are checkpointed to persistent
 740          * storage and we will be safe to update journal start in the
 741          * superblock with the numbers we get here.
 742          */
 743         update_tail =
 744                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
 745
 746         write_lock(&journal->j_state_lock);
 747         if (update_tail) {
 748                 long freed = first_block - journal->j_tail;
 749
 750                 if (first_block < journal->j_tail)
 751                         freed += journal->j_last - journal->j_first;
 752                 /* Update tail only if we free significant amount of space */
 753                 if (freed < journal->j_maxlen / 4)
 754                         update_tail = 0;
 755         }
 756         J_ASSERT(commit_transaction->t_state == T_COMMIT);
 757         commit_transaction->t_state = T_COMMIT_DFLUSH;
 758         write_unlock(&journal->j_state_lock);
 759
 760         /*
 761          * If the journal is not located on the file system device,
 762          * then we must flush the file system device before we issue
 763          * the commit record
 764          */
 765         if (commit_transaction->t_need_data_flush &&
 766             (journal->j_fs_dev != journal->j_dev) &&
 767             (journal->j_flags & JBD2_BARRIER))
 768                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 769
 770         /* Done it all: now write the commit record asynchronously. */
 771         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 772                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 773                 err = journal_submit_commit_record(journal, commit_transaction,
 774                                                  &cbh, crc32_sum);
 775                 if (err)
 776                         __jbd2_journal_abort_hard(journal);
 777         }
 778
 779         blk_finish_plug(&plug);
 780
 781         /* Lo and behold: we have just managed to send a transaction to
 782            the log.  Before we can commit it, wait for the IO so far to
 783            complete.  Control buffers being written are on the
 784            transaction's t_log_list queue, and metadata buffers are on
 785            the t_iobuf_list queue.
 786
 787            Wait for the buffers in reverse order.  That way we are
 788            less likely to be woken up until all IOs have completed, and
 789            so we incur less scheduling load.
 790         */
 791
 792         jbd_debug(3, "JBD2: commit phase 3\n");
 793
 794         /*
 795          * akpm: these are BJ_IO, and j_list_lock is not needed.
 796          * See __journal_try_to_free_buffer.
 797          */
 798 wait_for_iobuf:
 799         while (commit_transaction->t_iobuf_list != NULL) {
 800                 struct buffer_head *bh;
 801
 802                 jh = commit_transaction->t_iobuf_list->b_tprev;
 803                 bh = jh2bh(jh);
 804                 if (buffer_locked(bh)) {
 805                         wait_on_buffer(bh);
 806                         goto wait_for_iobuf;
 807                 }
 808                 if (cond_resched())
 809                         goto wait_for_iobuf;
 810
 811                 if (unlikely(!buffer_uptodate(bh)))
 812                         err = -EIO;
 813
 814                 clear_buffer_jwrite(bh);
 815
 816                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
 817                 jbd2_journal_unfile_buffer(journal, jh);
 818
 819                 /*
 820                  * ->t_iobuf_list should contain only dummy buffer_heads
 821                  * which were created by jbd2_journal_write_metadata_buffer().
 822                  */
 823                 BUFFER_TRACE(bh, "dumping temporary bh");
 824                 jbd2_journal_put_journal_head(jh);
 825                 __brelse(bh);
 826                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
 827                 free_buffer_head(bh);
 828
 829                 /* We also have to unlock and free the corresponding
 830                    shadowed buffer */
 831                 jh = commit_transaction->t_shadow_list->b_tprev;
 832                 bh = jh2bh(jh);
 833                 clear_bit(BH_JWrite, &bh->b_state);
 834                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
 835
 836                 /* The metadata is now released for reuse, but we need
 837                    to remember it against this transaction so that when
 838                    we finally commit, we can do any checkpointing
 839                    required. */
 840                 JBUFFER_TRACE(jh, "file as BJ_Forget");
 841                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
 842                 /*
 843                  * Wake up any transactions which were waiting for this IO to
 844                  * complete. The barrier must be here so that changes by
 845                  * jbd2_journal_file_buffer() take effect before wake_up_bit()
 846                  * does the waitqueue check.
 847                  */
 848                 smp_mb();
 849                 wake_up_bit(&bh->b_state, BH_Unshadow);
 850                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
 851                 __brelse(bh);
 852         }
 853
 854         J_ASSERT (commit_transaction->t_shadow_list == NULL);
 855
 856         jbd_debug(3, "JBD2: commit phase 4\n");
 857
 858         /* Here we wait for the revoke record and descriptor record buffers */
 859  wait_for_ctlbuf:
 860         while (commit_transaction->t_log_list != NULL) {
 861                 struct buffer_head *bh;
 862
 863                 jh = commit_transaction->t_log_list->b_tprev;
 864                 bh = jh2bh(jh);
 865                 if (buffer_locked(bh)) {
 866                         wait_on_buffer(bh);
 867                         goto wait_for_ctlbuf;
 868                 }
 869                 if (cond_resched())
 870                         goto wait_for_ctlbuf;
 871
 872                 if (unlikely(!buffer_uptodate(bh)))
 873                         err = -EIO;
 874
 875                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
 876                 clear_buffer_jwrite(bh);
 877                 jbd2_journal_unfile_buffer(journal, jh);
 878                 jbd2_journal_put_journal_head(jh);
 879                 __brelse(bh);           /* One for getblk */
 880                 /* AKPM: bforget here */
 881         }
 882
 883         if (err)
 884                 jbd2_journal_abort(journal, err);
 885
 886         jbd_debug(3, "JBD2: commit phase 5\n");
 887         write_lock(&journal->j_state_lock);
 888         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
 889         commit_transaction->t_state = T_COMMIT_JFLUSH;
 890         write_unlock(&journal->j_state_lock);
 891
 892         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
 893                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
 894                 err = journal_submit_commit_record(journal, commit_transaction,
 895                                                 &cbh, crc32_sum);
 896                 if (err)
 897                         __jbd2_journal_abort_hard(journal);
 898         }
 899         if (cbh)
 900                 err = journal_wait_on_commit_record(journal, cbh);
 901         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
 902                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
 903             journal->j_flags & JBD2_BARRIER) {
 904                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
 905         }
 906
 907         if (err)
 908                 jbd2_journal_abort(journal, err);
 909
 910         /*
 911          * Now disk caches for filesystem device are flushed so we are safe to
 912          * erase checkpointed transactions from the log by updating journal
 913          * superblock.
 914          */
 915         if (update_tail)
 916                 jbd2_update_log_tail(journal, first_tid, first_block);
 917
 918         /* End of a transaction!  Finally, we can do checkpoint
 919            processing: any buffers committed as a result of this
 920            transaction can be removed from any checkpoint list it was on
 921            before. */
 922
 923         jbd_debug(3, "JBD2: commit phase 6\n");
 924
 925         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
 926         J_ASSERT(commit_transaction->t_buffers == NULL);
 927         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
 928         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
 929         J_ASSERT(commit_transaction->t_shadow_list == NULL);
 930         J_ASSERT(commit_transaction->t_log_list == NULL);
 931
 932 restart_loop:
 933         /*
 934          * As there are other places (journal_unmap_buffer()) adding buffers
 935          * to this list we have to be careful and hold the j_list_lock.
 936          */
 937         spin_lock(&journal->j_list_lock);
 938         while (commit_transaction->t_forget) {
 939                 transaction_t *cp_transaction;
 940                 struct buffer_head *bh;
 941                 int try_to_free = 0;
 942
 943                 jh = commit_transaction->t_forget;
 944                 spin_unlock(&journal->j_list_lock);
 945                 bh = jh2bh(jh);
 946                 /*
 947                  * Get a reference so that bh cannot be freed before we are
 948                  * done with it.
 949                  */
 950                 get_bh(bh);
 951                 jbd_lock_bh_state(bh);
 952                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
 953
 954                 /*
 955                  * If there is undo-protected committed data against
 956                  * this buffer, then we can remove it now.  If it is a
 957                  * buffer needing such protection, the old frozen_data
 958                  * field now points to a committed version of the
 959                  * buffer, so rotate that field to the new committed
 960                  * data.
 961                  *
 962                  * Otherwise, we can just throw away the frozen data now.
 963                  *
 964                  * We also know that the frozen data has already fired
 965                  * its triggers if they exist, so we can clear that too.
 966                  */
 967                 if (jh->b_committed_data) {
 968                         jbd2_free(jh->b_committed_data, bh->b_size);
 969                         jh->b_committed_data = NULL;
 970                         if (jh->b_frozen_data) {
 971                                 jh->b_committed_data = jh->b_frozen_data;
 972                                 jh->b_frozen_data = NULL;
 973                                 jh->b_frozen_triggers = NULL;
 974                         }
 975                 } else if (jh->b_frozen_data) {
 976                         jbd2_free(jh->b_frozen_data, bh->b_size);
 977                         jh->b_frozen_data = NULL;
 978                         jh->b_frozen_triggers = NULL;
 979                 }
 980
 981                 spin_lock(&journal->j_list_lock);
 982                 cp_transaction = jh->b_cp_transaction;
 983                 if (cp_transaction) {
 984                         JBUFFER_TRACE(jh, "remove from old cp transaction");
 985                         cp_transaction->t_chp_stats.cs_dropped++;
 986                         __jbd2_journal_remove_checkpoint(jh);
 987                 }
 988
 989                 /* Only re-checkpoint the buffer_head if it is marked
 990                  * dirty.  If the buffer was added to the BJ_Forget list
 991                  * by jbd2_journal_forget, it may no longer be dirty and
 992                  * there's no point in keeping a checkpoint record for
 993                  * it. */
 994
 995                 /* A buffer which has been freed while still being
 996                  * journaled by a previous transaction may end up still
 997                  * being dirty here, but we want to avoid writing back
 998                  * that buffer in the future after the "add to orphan"
 999                  * operation has been committed.  That's not only a performance
1000                  * gain, it also stops aliasing problems if the buffer is
1001                  * left behind for writeback and gets reallocated for another
1002                  * use in a different page. */
1003                 if (buffer_freed(bh) && !jh->b_next_transaction) {
1004                         clear_buffer_freed(bh);
1005                         clear_buffer_jbddirty(bh);
1006                 }
1007
1008                 if (buffer_jbddirty(bh)) {
1009                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
1010                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1011                         if (is_journal_aborted(journal))
1012                                 clear_buffer_jbddirty(bh);
1013                 } else {
1014                         J_ASSERT_BH(bh, !buffer_dirty(bh));
1015                         /*
1016                          * The buffer on BJ_Forget list and not jbddirty means
1017                          * it has been freed by this transaction and hence it
1018                          * could not have been reallocated until this
1019                          * transaction has committed. *BUT* it could be
1020                          * reallocated once we have written all the data to
1021                          * disk and before we process the buffer on BJ_Forget
1022                          * list.
1023                          */
1024                         if (!jh->b_next_transaction)
1025                                 try_to_free = 1;
1026                 }
1027                 JBUFFER_TRACE(jh, "refile or unfile buffer");
1028                 __jbd2_journal_refile_buffer(jh);
1029                 jbd_unlock_bh_state(bh);
1030                 if (try_to_free)
1031                         release_buffer_page(bh);        /* Drops bh reference */
1032                 else
1033                         __brelse(bh);
1034                 cond_resched_lock(&journal->j_list_lock);
1035         }
1036         spin_unlock(&journal->j_list_lock);
1037         /*
1038          * This is a bit sleazy.  We use j_list_lock to protect transition
1039          * of a transaction into T_FINISHED state and calling
1040          * __jbd2_journal_drop_transaction(). Otherwise we could race with
1041          * other checkpointing code processing the transaction...
1042          */
1043         write_lock(&journal->j_state_lock);
1044         spin_lock(&journal->j_list_lock);
1045         /*
1046          * Now recheck if some buffers did not get attached to the transaction
1047          * while the lock was dropped...
1048          */
1049         if (commit_transaction->t_forget) {
1050                 spin_unlock(&journal->j_list_lock);
1051                 write_unlock(&journal->j_state_lock);
1052                 goto restart_loop;
1053         }
1054
1055         /* Done with this transaction! */
1056
1057         jbd_debug(3, "JBD2: commit phase 7\n");
1058
1059         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1060
1061         commit_transaction->t_start = jiffies;
1062         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1063                                               commit_transaction->t_start);
1064
1065         /*
1066          * File the transaction statistics
1067          */
1068         stats.ts_tid = commit_transaction->t_tid;
1069         stats.run.rs_handle_count =
1070                 atomic_read(&commit_transaction->t_handle_count);
1071         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1072                              commit_transaction->t_tid, &stats.run);
1073
1074         /*
1075          * Calculate overall stats
1076          */
1077         spin_lock(&journal->j_history_lock);
1078         journal->j_stats.ts_tid++;
1079         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1080         journal->j_stats.run.rs_running += stats.run.rs_running;
1081         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1082         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1083         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1084         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1085         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1086         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1087         spin_unlock(&journal->j_history_lock);
1088
1089         commit_transaction->t_state = T_FINISHED;
1090         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1091         journal->j_commit_sequence = commit_transaction->t_tid;
1092         journal->j_committing_transaction = NULL;
1093         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1094
1095         /*
1096          * weight the commit time higher than the average time so we don't
1097          * react too strongly to vast changes in the commit time
1098          */
1099         if (likely(journal->j_average_commit_time))
1100                 journal->j_average_commit_time = (commit_time +
1101                                 journal->j_average_commit_time*3) / 4;
1102         else
1103                 journal->j_average_commit_time = commit_time;
1104         write_unlock(&journal->j_state_lock);
1105
1106         if (commit_transaction->t_checkpoint_list == NULL &&
1107             commit_transaction->t_checkpoint_io_list == NULL) {
1108                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1109                 to_free = 1;
1110         } else {
1111                 if (journal->j_checkpoint_transactions == NULL) {
1112                         journal->j_checkpoint_transactions = commit_transaction;
1113                         commit_transaction->t_cpnext = commit_transaction;
1114                         commit_transaction->t_cpprev = commit_transaction;
1115                 } else {
1116                         commit_transaction->t_cpnext =
1117                                 journal->j_checkpoint_transactions;
1118                         commit_transaction->t_cpprev =
1119                                 commit_transaction->t_cpnext->t_cpprev;
1120                         commit_transaction->t_cpnext->t_cpprev =
1121                                 commit_transaction;
1122                         commit_transaction->t_cpprev->t_cpnext =
1123                                 commit_transaction;
1124                 }
1125         }
1126         spin_unlock(&journal->j_list_lock);
1127
1128         if (journal->j_commit_callback)
1129                 journal->j_commit_callback(journal, commit_transaction);
1130
1131         trace_jbd2_end_commit(journal, commit_transaction);
1132         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1133                   journal->j_commit_sequence, journal->j_tail_sequence);
1134         if (to_free)
1135                 jbd2_journal_free_transaction(commit_transaction);
1136
1137         wake_up(&journal->j_wait_done_commit);
1138 }