jbd2: checksum commit blocks
[firefly-linux-kernel-4.4.55.git] / fs / jbd2 / commit.c
1 /*
2  * linux/fs/jbd2/commit.c
3  *
4  * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5  *
6  * Copyright 1998 Red Hat corp --- All Rights Reserved
7  *
8  * This file is part of the Linux kernel and is made available under
9  * the terms of the GNU General Public License, version 2, or at your
10  * option, any later version, incorporated herein by reference.
11  *
12  * Journal commit routines for the generic filesystem journaling code;
13  * part of the ext2fs journaling system.
14  */
15
16 #include <linux/time.h>
17 #include <linux/fs.h>
18 #include <linux/jbd2.h>
19 #include <linux/errno.h>
20 #include <linux/slab.h>
21 #include <linux/mm.h>
22 #include <linux/pagemap.h>
23 #include <linux/jiffies.h>
24 #include <linux/crc32.h>
25 #include <linux/writeback.h>
26 #include <linux/backing-dev.h>
27 #include <linux/bio.h>
28 #include <linux/blkdev.h>
29 #include <linux/bitops.h>
30 #include <trace/events/jbd2.h>
31
/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 *
 * Copies the I/O outcome reported in @uptodate into the buffer's uptodate
 * flag and then unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
        BUFFER_TRACE(bh, "");

        if (!uptodate)
                clear_buffer_uptodate(bh);
        else
                set_buffer_uptodate(bh);

        unlock_buffer(bh);
}
44
45 /*
46  * When an ext4 file is truncated, it is possible that some pages are not
47  * successfully freed, because they are attached to a committing transaction.
48  * After the transaction commits, these pages are left on the LRU, with no
49  * ->mapping, and with attached buffers.  These pages are trivially reclaimable
50  * by the VM, but their apparent absence upsets the VM accounting, and it makes
51  * the numbers in /proc/meminfo look odd.
52  *
53  * So here, we have a buffer which has just come off the forget list.  Look to
54  * see if we can strip all buffers from the backing page.
55  *
56  * Called under lock_journal(), and possibly under journal_datalist_lock.  The
57  * caller provided us with a ref against the buffer, and we drop that here.
58  */
59 static void release_buffer_page(struct buffer_head *bh)
60 {
61         struct page *page;
62
63         if (buffer_dirty(bh))
64                 goto nope;
65         if (atomic_read(&bh->b_count) != 1)
66                 goto nope;
67         page = bh->b_page;
68         if (!page)
69                 goto nope;
70         if (page->mapping)
71                 goto nope;
72
73         /* OK, it's a truncated page */
74         if (!trylock_page(page))
75                 goto nope;
76
77         page_cache_get(page);
78         __brelse(bh);
79         try_to_free_buffers(page);
80         unlock_page(page);
81         page_cache_release(page);
82         return;
83
84 nope:
85         __brelse(bh);
86 }
87
88 static void jbd2_commit_block_csum_set(journal_t *j,
89                                        struct journal_head *descriptor)
90 {
91         struct commit_header *h;
92         __u32 csum;
93
94         if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
95                 return;
96
97         h = (struct commit_header *)(jh2bh(descriptor)->b_data);
98         h->h_chksum_type = 0;
99         h->h_chksum_size = 0;
100         h->h_chksum[0] = 0;
101         csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
102                            j->j_blocksize);
103         h->h_chksum[0] = cpu_to_be32(csum);
104 }
105
106 /*
107  * Done it all: now submit the commit record.  We should have
108  * cleaned up our previous buffers by now, so if we are in abort
109  * mode we can now just skip the rest of the journal write
110  * entirely.
111  *
112  * Returns 1 if the journal needs to be aborted or 0 on success
113  */
114 static int journal_submit_commit_record(journal_t *journal,
115                                         transaction_t *commit_transaction,
116                                         struct buffer_head **cbh,
117                                         __u32 crc32_sum)
118 {
119         struct journal_head *descriptor;
120         struct commit_header *tmp;
121         struct buffer_head *bh;
122         int ret;
123         struct timespec now = current_kernel_time();
124
125         *cbh = NULL;
126
127         if (is_journal_aborted(journal))
128                 return 0;
129
130         descriptor = jbd2_journal_get_descriptor_buffer(journal);
131         if (!descriptor)
132                 return 1;
133
134         bh = jh2bh(descriptor);
135
136         tmp = (struct commit_header *)bh->b_data;
137         tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
138         tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
139         tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
140         tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
141         tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
142
143         if (JBD2_HAS_COMPAT_FEATURE(journal,
144                                     JBD2_FEATURE_COMPAT_CHECKSUM)) {
145                 tmp->h_chksum_type      = JBD2_CRC32_CHKSUM;
146                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
147                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
148         }
149         jbd2_commit_block_csum_set(journal, descriptor);
150
151         JBUFFER_TRACE(descriptor, "submit commit block");
152         lock_buffer(bh);
153         clear_buffer_dirty(bh);
154         set_buffer_uptodate(bh);
155         bh->b_end_io = journal_end_buffer_io_sync;
156
157         if (journal->j_flags & JBD2_BARRIER &&
158             !JBD2_HAS_INCOMPAT_FEATURE(journal,
159                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
160                 ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
161         else
162                 ret = submit_bh(WRITE_SYNC, bh);
163
164         *cbh = bh;
165         return ret;
166 }
167
168 /*
169  * This function along with journal_submit_commit_record
170  * allows to write the commit record asynchronously.
171  */
172 static int journal_wait_on_commit_record(journal_t *journal,
173                                          struct buffer_head *bh)
174 {
175         int ret = 0;
176
177         clear_buffer_dirty(bh);
178         wait_on_buffer(bh);
179
180         if (unlikely(!buffer_uptodate(bh)))
181                 ret = -EIO;
182         put_bh(bh);            /* One for getblk() */
183         jbd2_journal_put_journal_head(bh2jh(bh));
184
185         return ret;
186 }
187
188 /*
189  * write the filemap data using writepage() address_space_operations.
190  * We don't do block allocation here even for delalloc. We don't
191  * use writepages() because with dealyed allocation we may be doing
192  * block allocation in writepages().
193  */
194 static int journal_submit_inode_data_buffers(struct address_space *mapping)
195 {
196         int ret;
197         struct writeback_control wbc = {
198                 .sync_mode =  WB_SYNC_ALL,
199                 .nr_to_write = mapping->nrpages * 2,
200                 .range_start = 0,
201                 .range_end = i_size_read(mapping->host),
202         };
203
204         ret = generic_writepages(mapping, &wbc);
205         return ret;
206 }
207
208 /*
209  * Submit all the data buffers of inode associated with the transaction to
210  * disk.
211  *
212  * We are in a committing transaction. Therefore no new inode can be added to
213  * our inode list. We use JI_COMMIT_RUNNING flag to protect inode we currently
214  * operate on from being released while we write out pages.
215  */
216 static int journal_submit_data_buffers(journal_t *journal,
217                 transaction_t *commit_transaction)
218 {
219         struct jbd2_inode *jinode;
220         int err, ret = 0;
221         struct address_space *mapping;
222
223         spin_lock(&journal->j_list_lock);
224         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
225                 mapping = jinode->i_vfs_inode->i_mapping;
226                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
227                 spin_unlock(&journal->j_list_lock);
228                 /*
229                  * submit the inode data buffers. We use writepage
230                  * instead of writepages. Because writepages can do
231                  * block allocation  with delalloc. We need to write
232                  * only allocated blocks here.
233                  */
234                 trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
235                 err = journal_submit_inode_data_buffers(mapping);
236                 if (!ret)
237                         ret = err;
238                 spin_lock(&journal->j_list_lock);
239                 J_ASSERT(jinode->i_transaction == commit_transaction);
240                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
241                 smp_mb__after_clear_bit();
242                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
243         }
244         spin_unlock(&journal->j_list_lock);
245         return ret;
246 }
247
248 /*
249  * Wait for data submitted for writeout, refile inodes to proper
250  * transaction if needed.
251  *
252  */
253 static int journal_finish_inode_data_buffers(journal_t *journal,
254                 transaction_t *commit_transaction)
255 {
256         struct jbd2_inode *jinode, *next_i;
257         int err, ret = 0;
258
259         /* For locking, see the comment in journal_submit_data_buffers() */
260         spin_lock(&journal->j_list_lock);
261         list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
262                 set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
263                 spin_unlock(&journal->j_list_lock);
264                 err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
265                 if (err) {
266                         /*
267                          * Because AS_EIO is cleared by
268                          * filemap_fdatawait_range(), set it again so
269                          * that user process can get -EIO from fsync().
270                          */
271                         set_bit(AS_EIO,
272                                 &jinode->i_vfs_inode->i_mapping->flags);
273
274                         if (!ret)
275                                 ret = err;
276                 }
277                 spin_lock(&journal->j_list_lock);
278                 clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
279                 smp_mb__after_clear_bit();
280                 wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
281         }
282
283         /* Now refile inode to proper lists */
284         list_for_each_entry_safe(jinode, next_i,
285                                  &commit_transaction->t_inode_list, i_list) {
286                 list_del(&jinode->i_list);
287                 if (jinode->i_next_transaction) {
288                         jinode->i_transaction = jinode->i_next_transaction;
289                         jinode->i_next_transaction = NULL;
290                         list_add(&jinode->i_list,
291                                 &jinode->i_transaction->t_inode_list);
292                 } else {
293                         jinode->i_transaction = NULL;
294                 }
295         }
296         spin_unlock(&journal->j_list_lock);
297
298         return ret;
299 }
300
301 static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
302 {
303         struct page *page = bh->b_page;
304         char *addr;
305         __u32 checksum;
306
307         addr = kmap_atomic(page);
308         checksum = crc32_be(crc32_sum,
309                 (void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
310         kunmap_atomic(addr);
311
312         return checksum;
313 }
314
315 static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
316                                    unsigned long long block)
317 {
318         tag->t_blocknr = cpu_to_be32(block & (u32)~0);
319         if (tag_bytes > JBD2_TAG_SIZE32)
320                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
321 }
322
323 static void jbd2_descr_block_csum_set(journal_t *j,
324                                       struct journal_head *descriptor)
325 {
326         struct jbd2_journal_block_tail *tail;
327         __u32 csum;
328
329         if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
330                 return;
331
332         tail = (struct jbd2_journal_block_tail *)
333                         (jh2bh(descriptor)->b_data + j->j_blocksize -
334                         sizeof(struct jbd2_journal_block_tail));
335         tail->t_checksum = 0;
336         csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
337                            j->j_blocksize);
338         tail->t_checksum = cpu_to_be32(csum);
339 }
340
341 /*
342  * jbd2_journal_commit_transaction
343  *
344  * The primary function for committing a transaction to the log.  This
345  * function is called by the journal thread to begin a complete commit.
346  */
347 void jbd2_journal_commit_transaction(journal_t *journal)
348 {
349         struct transaction_stats_s stats;
350         transaction_t *commit_transaction;
351         struct journal_head *jh, *new_jh, *descriptor;
352         struct buffer_head **wbuf = journal->j_wbuf;
353         int bufs;
354         int flags;
355         int err;
356         unsigned long long blocknr;
357         ktime_t start_time;
358         u64 commit_time;
359         char *tagp = NULL;
360         journal_header_t *header;
361         journal_block_tag_t *tag = NULL;
362         int space_left = 0;
363         int first_tag = 0;
364         int tag_flag;
365         int i, to_free = 0;
366         int tag_bytes = journal_tag_bytes(journal);
367         struct buffer_head *cbh = NULL; /* For transactional checksums */
368         __u32 crc32_sum = ~0;
369         struct blk_plug plug;
370         /* Tail of the journal */
371         unsigned long first_block;
372         tid_t first_tid;
373         int update_tail;
374         int csum_size = 0;
375
376         if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
377                 csum_size = sizeof(struct jbd2_journal_block_tail);
378
379         /*
380          * First job: lock down the current transaction and wait for
381          * all outstanding updates to complete.
382          */
383
384         /* Do we need to erase the effects of a prior jbd2_journal_flush? */
385         if (journal->j_flags & JBD2_FLUSHED) {
386                 jbd_debug(3, "super block updated\n");
387                 mutex_lock(&journal->j_checkpoint_mutex);
388                 /*
389                  * We hold j_checkpoint_mutex so tail cannot change under us.
390                  * We don't need any special data guarantees for writing sb
391                  * since journal is empty and it is ok for write to be
392                  * flushed only with transaction commit.
393                  */
394                 jbd2_journal_update_sb_log_tail(journal,
395                                                 journal->j_tail_sequence,
396                                                 journal->j_tail,
397                                                 WRITE_SYNC);
398                 mutex_unlock(&journal->j_checkpoint_mutex);
399         } else {
400                 jbd_debug(3, "superblock not updated\n");
401         }
402
403         J_ASSERT(journal->j_running_transaction != NULL);
404         J_ASSERT(journal->j_committing_transaction == NULL);
405
406         commit_transaction = journal->j_running_transaction;
407         J_ASSERT(commit_transaction->t_state == T_RUNNING);
408
409         trace_jbd2_start_commit(journal, commit_transaction);
410         jbd_debug(1, "JBD2: starting commit of transaction %d\n",
411                         commit_transaction->t_tid);
412
413         write_lock(&journal->j_state_lock);
414         commit_transaction->t_state = T_LOCKED;
415
416         trace_jbd2_commit_locking(journal, commit_transaction);
417         stats.run.rs_wait = commit_transaction->t_max_wait;
418         stats.run.rs_locked = jiffies;
419         stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
420                                               stats.run.rs_locked);
421
422         spin_lock(&commit_transaction->t_handle_lock);
423         while (atomic_read(&commit_transaction->t_updates)) {
424                 DEFINE_WAIT(wait);
425
426                 prepare_to_wait(&journal->j_wait_updates, &wait,
427                                         TASK_UNINTERRUPTIBLE);
428                 if (atomic_read(&commit_transaction->t_updates)) {
429                         spin_unlock(&commit_transaction->t_handle_lock);
430                         write_unlock(&journal->j_state_lock);
431                         schedule();
432                         write_lock(&journal->j_state_lock);
433                         spin_lock(&commit_transaction->t_handle_lock);
434                 }
435                 finish_wait(&journal->j_wait_updates, &wait);
436         }
437         spin_unlock(&commit_transaction->t_handle_lock);
438
439         J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
440                         journal->j_max_transaction_buffers);
441
442         /*
443          * First thing we are allowed to do is to discard any remaining
444          * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
445          * that there are no such buffers: if a large filesystem
446          * operation like a truncate needs to split itself over multiple
447          * transactions, then it may try to do a jbd2_journal_restart() while
448          * there are still BJ_Reserved buffers outstanding.  These must
449          * be released cleanly from the current transaction.
450          *
451          * In this case, the filesystem must still reserve write access
452          * again before modifying the buffer in the new transaction, but
453          * we do not require it to remember exactly which old buffers it
454          * has reserved.  This is consistent with the existing behaviour
455          * that multiple jbd2_journal_get_write_access() calls to the same
456          * buffer are perfectly permissible.
457          */
458         while (commit_transaction->t_reserved_list) {
459                 jh = commit_transaction->t_reserved_list;
460                 JBUFFER_TRACE(jh, "reserved, unused: refile");
461                 /*
462                  * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
463                  * leave undo-committed data.
464                  */
465                 if (jh->b_committed_data) {
466                         struct buffer_head *bh = jh2bh(jh);
467
468                         jbd_lock_bh_state(bh);
469                         jbd2_free(jh->b_committed_data, bh->b_size);
470                         jh->b_committed_data = NULL;
471                         jbd_unlock_bh_state(bh);
472                 }
473                 jbd2_journal_refile_buffer(journal, jh);
474         }
475
476         /*
477          * Now try to drop any written-back buffers from the journal's
478          * checkpoint lists.  We do this *before* commit because it potentially
479          * frees some memory
480          */
481         spin_lock(&journal->j_list_lock);
482         __jbd2_journal_clean_checkpoint_list(journal);
483         spin_unlock(&journal->j_list_lock);
484
485         jbd_debug(3, "JBD2: commit phase 1\n");
486
487         /*
488          * Clear revoked flag to reflect there is no revoked buffers
489          * in the next transaction which is going to be started.
490          */
491         jbd2_clear_buffer_revoked_flags(journal);
492
493         /*
494          * Switch to a new revoke table.
495          */
496         jbd2_journal_switch_revoke_table(journal);
497
498         trace_jbd2_commit_flushing(journal, commit_transaction);
499         stats.run.rs_flushing = jiffies;
500         stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
501                                              stats.run.rs_flushing);
502
503         commit_transaction->t_state = T_FLUSH;
504         journal->j_committing_transaction = commit_transaction;
505         journal->j_running_transaction = NULL;
506         start_time = ktime_get();
507         commit_transaction->t_log_start = journal->j_head;
508         wake_up(&journal->j_wait_transaction_locked);
509         write_unlock(&journal->j_state_lock);
510
511         jbd_debug(3, "JBD2: commit phase 2\n");
512
513         /*
514          * Now start flushing things to disk, in the order they appear
515          * on the transaction lists.  Data blocks go first.
516          */
517         err = journal_submit_data_buffers(journal, commit_transaction);
518         if (err)
519                 jbd2_journal_abort(journal, err);
520
521         blk_start_plug(&plug);
522         jbd2_journal_write_revoke_records(journal, commit_transaction,
523                                           WRITE_SYNC);
524         blk_finish_plug(&plug);
525
526         jbd_debug(3, "JBD2: commit phase 2\n");
527
528         /*
529          * Way to go: we have now written out all of the data for a
530          * transaction!  Now comes the tricky part: we need to write out
531          * metadata.  Loop over the transaction's entire buffer list:
532          */
533         write_lock(&journal->j_state_lock);
534         commit_transaction->t_state = T_COMMIT;
535         write_unlock(&journal->j_state_lock);
536
537         trace_jbd2_commit_logging(journal, commit_transaction);
538         stats.run.rs_logging = jiffies;
539         stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
540                                                stats.run.rs_logging);
541         stats.run.rs_blocks =
542                 atomic_read(&commit_transaction->t_outstanding_credits);
543         stats.run.rs_blocks_logged = 0;
544
545         J_ASSERT(commit_transaction->t_nr_buffers <=
546                  atomic_read(&commit_transaction->t_outstanding_credits));
547
548         err = 0;
549         descriptor = NULL;
550         bufs = 0;
551         blk_start_plug(&plug);
552         while (commit_transaction->t_buffers) {
553
554                 /* Find the next buffer to be journaled... */
555
556                 jh = commit_transaction->t_buffers;
557
558                 /* If we're in abort mode, we just un-journal the buffer and
559                    release it. */
560
561                 if (is_journal_aborted(journal)) {
562                         clear_buffer_jbddirty(jh2bh(jh));
563                         JBUFFER_TRACE(jh, "journal is aborting: refile");
564                         jbd2_buffer_abort_trigger(jh,
565                                                   jh->b_frozen_data ?
566                                                   jh->b_frozen_triggers :
567                                                   jh->b_triggers);
568                         jbd2_journal_refile_buffer(journal, jh);
569                         /* If that was the last one, we need to clean up
570                          * any descriptor buffers which may have been
571                          * already allocated, even if we are now
572                          * aborting. */
573                         if (!commit_transaction->t_buffers)
574                                 goto start_journal_io;
575                         continue;
576                 }
577
578                 /* Make sure we have a descriptor block in which to
579                    record the metadata buffer. */
580
581                 if (!descriptor) {
582                         struct buffer_head *bh;
583
584                         J_ASSERT (bufs == 0);
585
586                         jbd_debug(4, "JBD2: get descriptor\n");
587
588                         descriptor = jbd2_journal_get_descriptor_buffer(journal);
589                         if (!descriptor) {
590                                 jbd2_journal_abort(journal, -EIO);
591                                 continue;
592                         }
593
594                         bh = jh2bh(descriptor);
595                         jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
596                                 (unsigned long long)bh->b_blocknr, bh->b_data);
597                         header = (journal_header_t *)&bh->b_data[0];
598                         header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
599                         header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
600                         header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);
601
602                         tagp = &bh->b_data[sizeof(journal_header_t)];
603                         space_left = bh->b_size - sizeof(journal_header_t);
604                         first_tag = 1;
605                         set_buffer_jwrite(bh);
606                         set_buffer_dirty(bh);
607                         wbuf[bufs++] = bh;
608
609                         /* Record it so that we can wait for IO
610                            completion later */
611                         BUFFER_TRACE(bh, "ph3: file as descriptor");
612                         jbd2_journal_file_buffer(descriptor, commit_transaction,
613                                         BJ_LogCtl);
614                 }
615
616                 /* Where is the buffer to be written? */
617
618                 err = jbd2_journal_next_log_block(journal, &blocknr);
619                 /* If the block mapping failed, just abandon the buffer
620                    and repeat this loop: we'll fall into the
621                    refile-on-abort condition above. */
622                 if (err) {
623                         jbd2_journal_abort(journal, err);
624                         continue;
625                 }
626
627                 /*
628                  * start_this_handle() uses t_outstanding_credits to determine
629                  * the free space in the log, but this counter is changed
630                  * by jbd2_journal_next_log_block() also.
631                  */
632                 atomic_dec(&commit_transaction->t_outstanding_credits);
633
634                 /* Bump b_count to prevent truncate from stumbling over
635                    the shadowed buffer!  @@@ This can go if we ever get
636                    rid of the BJ_IO/BJ_Shadow pairing of buffers. */
637                 atomic_inc(&jh2bh(jh)->b_count);
638
639                 /* Make a temporary IO buffer with which to write it out
640                    (this will requeue both the metadata buffer and the
641                    temporary IO buffer). new_bh goes on BJ_IO*/
642
643                 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
644                 /*
645                  * akpm: jbd2_journal_write_metadata_buffer() sets
646                  * new_bh->b_transaction to commit_transaction.
647                  * We need to clean this up before we release new_bh
648                  * (which is of type BJ_IO)
649                  */
650                 JBUFFER_TRACE(jh, "ph3: write metadata");
651                 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
652                                                       jh, &new_jh, blocknr);
653                 if (flags < 0) {
654                         jbd2_journal_abort(journal, flags);
655                         continue;
656                 }
657                 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
658                 wbuf[bufs++] = jh2bh(new_jh);
659
660                 /* Record the new block's tag in the current descriptor
661                    buffer */
662
663                 tag_flag = 0;
664                 if (flags & 1)
665                         tag_flag |= JBD2_FLAG_ESCAPE;
666                 if (!first_tag)
667                         tag_flag |= JBD2_FLAG_SAME_UUID;
668
669                 tag = (journal_block_tag_t *) tagp;
670                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
671                 tag->t_flags = cpu_to_be16(tag_flag);
672                 tagp += tag_bytes;
673                 space_left -= tag_bytes;
674
675                 if (first_tag) {
676                         memcpy (tagp, journal->j_uuid, 16);
677                         tagp += 16;
678                         space_left -= 16;
679                         first_tag = 0;
680                 }
681
682                 /* If there's no more to do, or if the descriptor is full,
683                    let the IO rip! */
684
685                 if (bufs == journal->j_wbufsize ||
686                     commit_transaction->t_buffers == NULL ||
687                     space_left < tag_bytes + 16 + csum_size) {
688
689                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
690
691                         /* Write an end-of-descriptor marker before
692                            submitting the IOs.  "tag" still points to
693                            the last tag we set up. */
694
695                         tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
696
697                         jbd2_descr_block_csum_set(journal, descriptor);
698 start_journal_io:
699                         for (i = 0; i < bufs; i++) {
700                                 struct buffer_head *bh = wbuf[i];
701                                 /*
702                                  * Compute checksum.
703                                  */
704                                 if (JBD2_HAS_COMPAT_FEATURE(journal,
705                                         JBD2_FEATURE_COMPAT_CHECKSUM)) {
706                                         crc32_sum =
707                                             jbd2_checksum_data(crc32_sum, bh);
708                                 }
709
710                                 lock_buffer(bh);
711                                 clear_buffer_dirty(bh);
712                                 set_buffer_uptodate(bh);
713                                 bh->b_end_io = journal_end_buffer_io_sync;
714                                 submit_bh(WRITE_SYNC, bh);
715                         }
716                         cond_resched();
717                         stats.run.rs_blocks_logged += bufs;
718
719                         /* Force a new descriptor to be generated next
720                            time round the loop. */
721                         descriptor = NULL;
722                         bufs = 0;
723                 }
724         }
725
726         err = journal_finish_inode_data_buffers(journal, commit_transaction);
727         if (err) {
728                 printk(KERN_WARNING
729                         "JBD2: Detected IO errors while flushing file data "
730                        "on %s\n", journal->j_devname);
731                 if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
732                         jbd2_journal_abort(journal, err);
733                 err = 0;
734         }
735
736         /*
737          * Get current oldest transaction in the log before we issue flush
738          * to the filesystem device. After the flush we can be sure that
739          * blocks of all older transactions are checkpointed to persistent
740          * storage and we will be safe to update journal start in the
741          * superblock with the numbers we get here.
742          */
743         update_tail =
744                 jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
745
746         write_lock(&journal->j_state_lock);
747         if (update_tail) {
748                 long freed = first_block - journal->j_tail;
749
750                 if (first_block < journal->j_tail)
751                         freed += journal->j_last - journal->j_first;
752                 /* Update tail only if we free significant amount of space */
753                 if (freed < journal->j_maxlen / 4)
754                         update_tail = 0;
755         }
756         J_ASSERT(commit_transaction->t_state == T_COMMIT);
757         commit_transaction->t_state = T_COMMIT_DFLUSH;
758         write_unlock(&journal->j_state_lock);
759
760         /* 
761          * If the journal is not located on the file system device,
762          * then we must flush the file system device before we issue
763          * the commit record
764          */
765         if (commit_transaction->t_need_data_flush &&
766             (journal->j_fs_dev != journal->j_dev) &&
767             (journal->j_flags & JBD2_BARRIER))
768                 blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
769
770         /* Done it all: now write the commit record asynchronously. */
771         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
772                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
773                 err = journal_submit_commit_record(journal, commit_transaction,
774                                                  &cbh, crc32_sum);
775                 if (err)
776                         __jbd2_journal_abort_hard(journal);
777         }
778
779         blk_finish_plug(&plug);
780
781         /* Lo and behold: we have just managed to send a transaction to
782            the log.  Before we can commit it, wait for the IO so far to
783            complete.  Control buffers being written are on the
784            transaction's t_log_list queue, and metadata buffers are on
785            the t_iobuf_list queue.
786
787            Wait for the buffers in reverse order.  That way we are
788            less likely to be woken up until all IOs have completed, and
789            so we incur less scheduling load.
790         */
791
792         jbd_debug(3, "JBD2: commit phase 3\n");
793
794         /*
795          * akpm: these are BJ_IO, and j_list_lock is not needed.
796          * See __journal_try_to_free_buffer.
797          */
798 wait_for_iobuf:
799         while (commit_transaction->t_iobuf_list != NULL) {
800                 struct buffer_head *bh;
801
802                 jh = commit_transaction->t_iobuf_list->b_tprev;
803                 bh = jh2bh(jh);
804                 if (buffer_locked(bh)) {
805                         wait_on_buffer(bh);
806                         goto wait_for_iobuf;
807                 }
808                 if (cond_resched())
809                         goto wait_for_iobuf;
810
811                 if (unlikely(!buffer_uptodate(bh)))
812                         err = -EIO;
813
814                 clear_buffer_jwrite(bh);
815
816                 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
817                 jbd2_journal_unfile_buffer(journal, jh);
818
819                 /*
820                  * ->t_iobuf_list should contain only dummy buffer_heads
821                  * which were created by jbd2_journal_write_metadata_buffer().
822                  */
823                 BUFFER_TRACE(bh, "dumping temporary bh");
824                 jbd2_journal_put_journal_head(jh);
825                 __brelse(bh);
826                 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
827                 free_buffer_head(bh);
828
829                 /* We also have to unlock and free the corresponding
830                    shadowed buffer */
831                 jh = commit_transaction->t_shadow_list->b_tprev;
832                 bh = jh2bh(jh);
833                 clear_bit(BH_JWrite, &bh->b_state);
834                 J_ASSERT_BH(bh, buffer_jbddirty(bh));
835
836                 /* The metadata is now released for reuse, but we need
837                    to remember it against this transaction so that when
838                    we finally commit, we can do any checkpointing
839                    required. */
840                 JBUFFER_TRACE(jh, "file as BJ_Forget");
841                 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
842                 /*
843                  * Wake up any transactions which were waiting for this IO to
844                  * complete. The barrier must be here so that changes by
845                  * jbd2_journal_file_buffer() take effect before wake_up_bit()
846                  * does the waitqueue check.
847                  */
848                 smp_mb();
849                 wake_up_bit(&bh->b_state, BH_Unshadow);
850                 JBUFFER_TRACE(jh, "brelse shadowed buffer");
851                 __brelse(bh);
852         }
853
854         J_ASSERT (commit_transaction->t_shadow_list == NULL);
855
856         jbd_debug(3, "JBD2: commit phase 4\n");
857
858         /* Here we wait for the revoke record and descriptor record buffers */
859  wait_for_ctlbuf:
860         while (commit_transaction->t_log_list != NULL) {
861                 struct buffer_head *bh;
862
863                 jh = commit_transaction->t_log_list->b_tprev;
864                 bh = jh2bh(jh);
865                 if (buffer_locked(bh)) {
866                         wait_on_buffer(bh);
867                         goto wait_for_ctlbuf;
868                 }
869                 if (cond_resched())
870                         goto wait_for_ctlbuf;
871
872                 if (unlikely(!buffer_uptodate(bh)))
873                         err = -EIO;
874
875                 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
876                 clear_buffer_jwrite(bh);
877                 jbd2_journal_unfile_buffer(journal, jh);
878                 jbd2_journal_put_journal_head(jh);
879                 __brelse(bh);           /* One for getblk */
880                 /* AKPM: bforget here */
881         }
882
883         if (err)
884                 jbd2_journal_abort(journal, err);
885
886         jbd_debug(3, "JBD2: commit phase 5\n");
887         write_lock(&journal->j_state_lock);
888         J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
889         commit_transaction->t_state = T_COMMIT_JFLUSH;
890         write_unlock(&journal->j_state_lock);
891
892         if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
893                                        JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
894                 err = journal_submit_commit_record(journal, commit_transaction,
895                                                 &cbh, crc32_sum);
896                 if (err)
897                         __jbd2_journal_abort_hard(journal);
898         }
899         if (cbh)
900                 err = journal_wait_on_commit_record(journal, cbh);
901         if (JBD2_HAS_INCOMPAT_FEATURE(journal,
902                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
903             journal->j_flags & JBD2_BARRIER) {
904                 blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
905         }
906
907         if (err)
908                 jbd2_journal_abort(journal, err);
909
910         /*
911          * Now disk caches for filesystem device are flushed so we are safe to
912          * erase checkpointed transactions from the log by updating journal
913          * superblock.
914          */
915         if (update_tail)
916                 jbd2_update_log_tail(journal, first_tid, first_block);
917
918         /* End of a transaction!  Finally, we can do checkpoint
919            processing: any buffers committed as a result of this
920            transaction can be removed from any checkpoint list it was on
921            before. */
922
923         jbd_debug(3, "JBD2: commit phase 6\n");
924
925         J_ASSERT(list_empty(&commit_transaction->t_inode_list));
926         J_ASSERT(commit_transaction->t_buffers == NULL);
927         J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
928         J_ASSERT(commit_transaction->t_iobuf_list == NULL);
929         J_ASSERT(commit_transaction->t_shadow_list == NULL);
930         J_ASSERT(commit_transaction->t_log_list == NULL);
931
932 restart_loop:
933         /*
934          * As there are other places (journal_unmap_buffer()) adding buffers
935          * to this list we have to be careful and hold the j_list_lock.
936          */
937         spin_lock(&journal->j_list_lock);
938         while (commit_transaction->t_forget) {
939                 transaction_t *cp_transaction;
940                 struct buffer_head *bh;
941                 int try_to_free = 0;
942
943                 jh = commit_transaction->t_forget;
944                 spin_unlock(&journal->j_list_lock);
945                 bh = jh2bh(jh);
946                 /*
947                  * Get a reference so that bh cannot be freed before we are
948                  * done with it.
949                  */
950                 get_bh(bh);
951                 jbd_lock_bh_state(bh);
952                 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction);
953
954                 /*
955                  * If there is undo-protected committed data against
956                  * this buffer, then we can remove it now.  If it is a
957                  * buffer needing such protection, the old frozen_data
958                  * field now points to a committed version of the
959                  * buffer, so rotate that field to the new committed
960                  * data.
961                  *
962                  * Otherwise, we can just throw away the frozen data now.
963                  *
964                  * We also know that the frozen data has already fired
965                  * its triggers if they exist, so we can clear that too.
966                  */
967                 if (jh->b_committed_data) {
968                         jbd2_free(jh->b_committed_data, bh->b_size);
969                         jh->b_committed_data = NULL;
970                         if (jh->b_frozen_data) {
971                                 jh->b_committed_data = jh->b_frozen_data;
972                                 jh->b_frozen_data = NULL;
973                                 jh->b_frozen_triggers = NULL;
974                         }
975                 } else if (jh->b_frozen_data) {
976                         jbd2_free(jh->b_frozen_data, bh->b_size);
977                         jh->b_frozen_data = NULL;
978                         jh->b_frozen_triggers = NULL;
979                 }
980
981                 spin_lock(&journal->j_list_lock);
982                 cp_transaction = jh->b_cp_transaction;
983                 if (cp_transaction) {
984                         JBUFFER_TRACE(jh, "remove from old cp transaction");
985                         cp_transaction->t_chp_stats.cs_dropped++;
986                         __jbd2_journal_remove_checkpoint(jh);
987                 }
988
989                 /* Only re-checkpoint the buffer_head if it is marked
990                  * dirty.  If the buffer was added to the BJ_Forget list
991                  * by jbd2_journal_forget, it may no longer be dirty and
992                  * there's no point in keeping a checkpoint record for
993                  * it. */
994
995                 /* A buffer which has been freed while still being
996                  * journaled by a previous transaction may end up still
997                  * being dirty here, but we want to avoid writing back
998                  * that buffer in the future after the "add to orphan"
999                  * operation has been committed.  That's not only a performance
1000                  * gain, it also stops aliasing problems if the buffer is
1001                  * left behind for writeback and gets reallocated for another
1002                  * use in a different page. */
1003                 if (buffer_freed(bh) && !jh->b_next_transaction) {
1004                         clear_buffer_freed(bh);
1005                         clear_buffer_jbddirty(bh);
1006                 }
1007
1008                 if (buffer_jbddirty(bh)) {
1009                         JBUFFER_TRACE(jh, "add to new checkpointing trans");
1010                         __jbd2_journal_insert_checkpoint(jh, commit_transaction);
1011                         if (is_journal_aborted(journal))
1012                                 clear_buffer_jbddirty(bh);
1013                 } else {
1014                         J_ASSERT_BH(bh, !buffer_dirty(bh));
1015                         /*
1016                          * The buffer on BJ_Forget list and not jbddirty means
1017                          * it has been freed by this transaction and hence it
1018                          * could not have been reallocated until this
1019                          * transaction has committed. *BUT* it could be
1020                          * reallocated once we have written all the data to
1021                          * disk and before we process the buffer on BJ_Forget
1022                          * list.
1023                          */
1024                         if (!jh->b_next_transaction)
1025                                 try_to_free = 1;
1026                 }
1027                 JBUFFER_TRACE(jh, "refile or unfile buffer");
1028                 __jbd2_journal_refile_buffer(jh);
1029                 jbd_unlock_bh_state(bh);
1030                 if (try_to_free)
1031                         release_buffer_page(bh);        /* Drops bh reference */
1032                 else
1033                         __brelse(bh);
1034                 cond_resched_lock(&journal->j_list_lock);
1035         }
1036         spin_unlock(&journal->j_list_lock);
1037         /*
1038          * This is a bit sleazy.  We use j_list_lock to protect transition
1039          * of a transaction into T_FINISHED state and calling
1040          * __jbd2_journal_drop_transaction(). Otherwise we could race with
1041          * other checkpointing code processing the transaction...
1042          */
1043         write_lock(&journal->j_state_lock);
1044         spin_lock(&journal->j_list_lock);
1045         /*
1046          * Now recheck if some buffers did not get attached to the transaction
1047          * while the lock was dropped...
1048          */
1049         if (commit_transaction->t_forget) {
1050                 spin_unlock(&journal->j_list_lock);
1051                 write_unlock(&journal->j_state_lock);
1052                 goto restart_loop;
1053         }
1054
1055         /* Done with this transaction! */
1056
1057         jbd_debug(3, "JBD2: commit phase 7\n");
1058
1059         J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);
1060
1061         commit_transaction->t_start = jiffies;
1062         stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
1063                                               commit_transaction->t_start);
1064
1065         /*
1066          * File the transaction statistics
1067          */
1068         stats.ts_tid = commit_transaction->t_tid;
1069         stats.run.rs_handle_count =
1070                 atomic_read(&commit_transaction->t_handle_count);
1071         trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
1072                              commit_transaction->t_tid, &stats.run);
1073
1074         /*
1075          * Calculate overall stats
1076          */
1077         spin_lock(&journal->j_history_lock);
1078         journal->j_stats.ts_tid++;
1079         journal->j_stats.run.rs_wait += stats.run.rs_wait;
1080         journal->j_stats.run.rs_running += stats.run.rs_running;
1081         journal->j_stats.run.rs_locked += stats.run.rs_locked;
1082         journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
1083         journal->j_stats.run.rs_logging += stats.run.rs_logging;
1084         journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
1085         journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
1086         journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
1087         spin_unlock(&journal->j_history_lock);
1088
1089         commit_transaction->t_state = T_FINISHED;
1090         J_ASSERT(commit_transaction == journal->j_committing_transaction);
1091         journal->j_commit_sequence = commit_transaction->t_tid;
1092         journal->j_committing_transaction = NULL;
1093         commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
1094
1095         /*
1096          * weight the commit time higher than the average time so we don't
1097          * react too strongly to vast changes in the commit time
1098          */
1099         if (likely(journal->j_average_commit_time))
1100                 journal->j_average_commit_time = (commit_time +
1101                                 journal->j_average_commit_time*3) / 4;
1102         else
1103                 journal->j_average_commit_time = commit_time;
1104         write_unlock(&journal->j_state_lock);
1105
1106         if (commit_transaction->t_checkpoint_list == NULL &&
1107             commit_transaction->t_checkpoint_io_list == NULL) {
1108                 __jbd2_journal_drop_transaction(journal, commit_transaction);
1109                 to_free = 1;
1110         } else {
1111                 if (journal->j_checkpoint_transactions == NULL) {
1112                         journal->j_checkpoint_transactions = commit_transaction;
1113                         commit_transaction->t_cpnext = commit_transaction;
1114                         commit_transaction->t_cpprev = commit_transaction;
1115                 } else {
1116                         commit_transaction->t_cpnext =
1117                                 journal->j_checkpoint_transactions;
1118                         commit_transaction->t_cpprev =
1119                                 commit_transaction->t_cpnext->t_cpprev;
1120                         commit_transaction->t_cpnext->t_cpprev =
1121                                 commit_transaction;
1122                         commit_transaction->t_cpprev->t_cpnext =
1123                                 commit_transaction;
1124                 }
1125         }
1126         spin_unlock(&journal->j_list_lock);
1127
1128         if (journal->j_commit_callback)
1129                 journal->j_commit_callback(journal, commit_transaction);
1130
1131         trace_jbd2_end_commit(journal, commit_transaction);
1132         jbd_debug(1, "JBD2: commit %d complete, head %d\n",
1133                   journal->j_commit_sequence, journal->j_tail_sequence);
1134         if (to_free)
1135                 jbd2_journal_free_transaction(commit_transaction);
1136
1137         wake_up(&journal->j_wait_done_commit);
1138 }