Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Oct 2008 23:10:29 +0000 (16:10 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Sun, 12 Oct 2008 23:10:29 +0000 (16:10 -0700)
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4:
  ext4: fix kconfig typo and extra whitespace
  ext4: fix build failure without procfs
  ext4: add an option to control error handling on file data
  jbd2: don't dirty original metadata buffer on abort
  ext4: add checks for errors from jbd2
  jbd2: fix error handling for checkpoint io
  jbd2: abort when failed to log metadata buffers

Documentation/filesystems/ext4.txt
fs/Kconfig
fs/ext4/ext4.h
fs/ext4/ioctl.c
fs/ext4/super.c
fs/jbd2/checkpoint.c
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/recovery.c
include/linux/jbd2.h

index 74484e6964052394bc9e3381c6ddd010f050d7fb..eb154ef36c2a4b708004ff08660f2a7ebe971fa7 100644 (file)
@@ -223,6 +223,11 @@ errors=remount-ro(*)       Remount the filesystem read-only on an error.
 errors=continue                Keep going on a filesystem error.
 errors=panic           Panic and halt the machine if an error occurs.
 
+data_err=ignore(*)     Just print an error message if an error occurs
+                       in a file data buffer in ordered mode.
+data_err=abort         Abort the journal if an error occurs in a file
+                       data buffer in ordered mode.
+
 grpid                  Give objects the same group ID as their creator.
 bsdgroups
 
index 40183d94b6834a6f1af0d92f1f90f5b50330afac..f54a157a029689fae2556c2f3989f6475e123ec7 100644 (file)
@@ -170,8 +170,8 @@ config EXT4DEV_COMPAT
        help
          Starting with 2.6.28, the name of the ext4 filesystem was
          renamed from ext4dev to ext4.  Unfortunately there are some
-         lagecy userspace programs (such as klibc's fstype) have
-         "ext4dev" hardcoded.  
+         legacy userspace programs (such as klibc's fstype) have
+         "ext4dev" hardcoded.
 
          To enable backwards compatibility so that systems that are
          still expecting to mount ext4 filesystems using ext4dev,
index f46a513a515732c39d437f0e85ddc55549b209f2..6690a41cdd9fc8ca6536d3a6077222e65830a834 100644 (file)
@@ -540,6 +540,8 @@ do {                                                                               \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT        0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC            0x8000000 /* Delalloc support */
+#define EXT4_MOUNT_DATA_ERR_ABORT      0x10000000 /* Abort on file data write */
+
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)              o &= ~EXT4_MOUNT_##opt
index ea27eaa0cfe5292ea4630644ed0ac8a81225424c..dc99b4776d58f052806b322b1bd91f8c7765ff83 100644 (file)
@@ -192,7 +192,7 @@ setversion_out:
        case EXT4_IOC_GROUP_EXTEND: {
                ext4_fsblk_t n_blocks_count;
                struct super_block *sb = inode->i_sb;
-               int err;
+               int err, err2;
 
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -206,8 +206,10 @@ setversion_out:
 
                err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-               jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+               err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+               if (err == 0)
+                       err = err2;
                mnt_drop_write(filp->f_path.mnt);
 
                return err;
@@ -215,7 +217,7 @@ setversion_out:
        case EXT4_IOC_GROUP_ADD: {
                struct ext4_new_group_data input;
                struct super_block *sb = inode->i_sb;
-               int err;
+               int err, err2;
 
                if (!capable(CAP_SYS_RESOURCE))
                        return -EPERM;
@@ -230,8 +232,10 @@ setversion_out:
 
                err = ext4_group_add(sb, &input);
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-               jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+               err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+               if (err == 0)
+                       err = err2;
                mnt_drop_write(filp->f_path.mnt);
 
                return err;
index 0e661c569660b4cb331a8c11c2cd45852ddb78bd..fb940c22ab0da95960f3c5bb757d79ef5a6222a2 100644 (file)
@@ -507,7 +507,8 @@ static void ext4_put_super(struct super_block *sb)
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
-       jbd2_journal_destroy(sbi->s_journal);
+       if (jbd2_journal_destroy(sbi->s_journal) < 0)
+               ext4_abort(sb, __func__, "Couldn't clean up the journal");
        sbi->s_journal = NULL;
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -777,6 +778,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_printf(seq, ",inode_readahead_blks=%u",
                           sbi->s_inode_readahead_blks);
 
+       if (test_opt(sb, DATA_ERR_ABORT))
+               seq_puts(seq, ",data_err=abort");
+
        ext4_show_quota_options(seq, sb);
        return 0;
 }
@@ -906,6 +910,7 @@ enum {
        Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
+       Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
@@ -952,6 +957,8 @@ static match_table_t tokens = {
        {Opt_data_journal, "data=journal"},
        {Opt_data_ordered, "data=ordered"},
        {Opt_data_writeback, "data=writeback"},
+       {Opt_data_err_abort, "data_err=abort"},
+       {Opt_data_err_ignore, "data_err=ignore"},
        {Opt_offusrjquota, "usrjquota="},
        {Opt_usrjquota, "usrjquota=%s"},
        {Opt_offgrpjquota, "grpjquota="},
@@ -1186,6 +1193,12 @@ static int parse_options(char *options, struct super_block *sb,
                                sbi->s_mount_opt |= data_opt;
                        }
                        break;
+               case Opt_data_err_abort:
+                       set_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                       break;
+               case Opt_data_err_ignore:
+                       clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
+                       break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
                        qtype = USRQUOTA;
@@ -2218,6 +2231,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto failed_mount;
        }
 
+#ifdef CONFIG_PROC_FS
        if (ext4_proc_root)
                sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 
@@ -2225,6 +2239,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
                                 &ext4_ui_proc_fops,
                                 &sbi->s_inode_readahead_blks);
+#endif
 
        bgl_lock_init(&sbi->s_blockgroup_lock);
 
@@ -2534,6 +2549,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
                journal->j_flags |= JBD2_BARRIER;
        else
                journal->j_flags &= ~JBD2_BARRIER;
+       if (test_opt(sb, DATA_ERR_ABORT))
+               journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
+       else
+               journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
        spin_unlock(&journal->j_state_lock);
 }
 
@@ -2853,7 +2872,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
        journal_t *journal = EXT4_SB(sb)->s_journal;
 
        jbd2_journal_lock_updates(journal);
-       jbd2_journal_flush(journal);
+       if (jbd2_journal_flush(journal) < 0)
+               goto out;
+
        lock_super(sb);
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
            sb->s_flags & MS_RDONLY) {
@@ -2862,6 +2883,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
                ext4_commit_super(sb, es, 1);
        }
        unlock_super(sb);
+
+out:
        jbd2_journal_unlock_updates(journal);
 }
 
@@ -2962,7 +2985,13 @@ static void ext4_write_super_lockfs(struct super_block *sb)
 
                /* Now we set up the journal barrier. */
                jbd2_journal_lock_updates(journal);
-               jbd2_journal_flush(journal);
+
+               /*
+                * We don't want to clear needs_recovery flag when we failed
+                * to flush the journal.
+                */
+               if (jbd2_journal_flush(journal) < 0)
+                       return;
 
                /* Journal blocked and flushed, clear needs_recovery flag. */
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
@@ -3402,8 +3431,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                 * otherwise be livelocked...
                 */
                jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
-               jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+               err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
                jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+               if (err) {
+                       path_put(&nd.path);
+                       return err;
+               }
        }
 
        err = vfs_quota_on_path(sb, type, format_id, &nd.path);
index 42895d3694581885de894790f505557fda891c24..9203c3332f170887a2aff921c81946826ddd1cf9 100644 (file)
@@ -94,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
        int ret = 0;
        struct buffer_head *bh = jh2bh(jh);
 
-       if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) {
+       if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+           !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
                JBUFFER_TRACE(jh, "remove from checkpoint list");
                ret = __jbd2_journal_remove_checkpoint(jh) + 1;
                jbd_unlock_bh_state(bh);
@@ -176,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
  * buffers. Note that we take the buffers in the opposite ordering
  * from the one in which they were submitted for IO.
  *
+ * Return 0 on success, and return <0 if some buffers have failed
+ * to be written out.
+ *
  * Called with j_list_lock held.
  */
-static void __wait_cp_io(journal_t *journal, transaction_t *transaction)
+static int __wait_cp_io(journal_t *journal, transaction_t *transaction)
 {
        struct journal_head *jh;
        struct buffer_head *bh;
        tid_t this_tid;
        int released = 0;
+       int ret = 0;
 
        this_tid = transaction->t_tid;
 restart:
        /* Did somebody clean up the transaction in the meanwhile? */
        if (journal->j_checkpoint_transactions != transaction ||
                        transaction->t_tid != this_tid)
-               return;
+               return ret;
        while (!released && transaction->t_checkpoint_io_list) {
                jh = transaction->t_checkpoint_io_list;
                bh = jh2bh(jh);
@@ -210,6 +215,9 @@ restart:
                        spin_lock(&journal->j_list_lock);
                        goto restart;
                }
+               if (unlikely(buffer_write_io_error(bh)))
+                       ret = -EIO;
+
                /*
                 * Now in whatever state the buffer currently is, we know that
                 * it has been written out and so we can drop it from the list
@@ -219,6 +227,8 @@ restart:
                jbd2_journal_remove_journal_head(bh);
                __brelse(bh);
        }
+
+       return ret;
 }
 
 #define NR_BATCH       64
@@ -242,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count)
  * Try to flush one buffer from the checkpoint list to disk.
  *
  * Return 1 if something happened which requires us to abort the current
- * scan of the checkpoint list.
+ * scan of the checkpoint list.  Return <0 if the buffer has failed to
+ * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
  * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
@@ -274,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                jbd2_log_wait_commit(journal, tid);
                ret = 1;
        } else if (!buffer_dirty(bh)) {
+               ret = 1;
+               if (unlikely(buffer_write_io_error(bh)))
+                       ret = -EIO;
                J_ASSERT_JH(jh, !buffer_jbddirty(bh));
                BUFFER_TRACE(bh, "remove from checkpoint");
                __jbd2_journal_remove_checkpoint(jh);
@@ -281,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
                jbd_unlock_bh_state(bh);
                jbd2_journal_remove_journal_head(bh);
                __brelse(bh);
-               ret = 1;
        } else {
                /*
                 * Important: we are about to write the buffer, and
@@ -314,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh,
  * to disk. We submit larger chunks of data at once.
  *
  * The journal should be locked before calling this function.
+ * Called with j_checkpoint_mutex held.
  */
 int jbd2_log_do_checkpoint(journal_t *journal)
 {
@@ -339,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal)
         * OK, we need to start writing disk blocks.  Take one transaction
         * and write it.
         */
+       result = 0;
        spin_lock(&journal->j_list_lock);
        if (!journal->j_checkpoint_transactions)
                goto out;
@@ -357,7 +372,7 @@ restart:
                int batch_count = 0;
                struct buffer_head *bhs[NR_BATCH];
                struct journal_head *jh;
-               int retry = 0;
+               int retry = 0, err;
 
                while (!retry && transaction->t_checkpoint_list) {
                        struct buffer_head *bh;
@@ -371,6 +386,8 @@ restart:
                        }
                        retry = __process_buffer(journal, jh, bhs, &batch_count,
                                                 transaction);
+                       if (retry < 0 && !result)
+                               result = retry;
                        if (!retry && (need_resched() ||
                                spin_needbreak(&journal->j_list_lock))) {
                                spin_unlock(&journal->j_list_lock);
@@ -395,14 +412,18 @@ restart:
                 * Now we have cleaned up the first transaction's checkpoint
                 * list. Let's clean up the second one
                 */
-               __wait_cp_io(journal, transaction);
+               err = __wait_cp_io(journal, transaction);
+               if (!result)
+                       result = err;
        }
 out:
        spin_unlock(&journal->j_list_lock);
-       result = jbd2_cleanup_journal_tail(journal);
        if (result < 0)
-               return result;
-       return 0;
+               jbd2_journal_abort(journal, result);
+       else
+               result = jbd2_cleanup_journal_tail(journal);
+
+       return (result < 0) ? result : 0;
 }
 
 /*
@@ -418,8 +439,9 @@ out:
  * This is the only part of the journaling code which really needs to be
  * aware of transaction aborts.  Checkpointing involves writing to the
  * main filesystem area rather than to the journal, so it can proceed
- * even in abort state, but we must not update the journal superblock if
- * we have an abort error outstanding.
+ * even in abort state, but we must not update the super block if
+ * checkpointing may have failed.  Otherwise, we would lose some metadata
+ * buffers which should be written-back to the filesystem.
  */
 
 int jbd2_cleanup_journal_tail(journal_t *journal)
@@ -428,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
        tid_t           first_tid;
        unsigned long   blocknr, freed;
 
+       if (is_journal_aborted(journal))
+               return 1;
+
        /* OK, work out the oldest transaction remaining in the log, and
         * the log block it starts at.
         *
index 0d3814a35ed11e19a09beb3007113bfa067d0ecb..0abe02c4242aa82d28aeef435a8f459a4ec729f5 100644 (file)
@@ -504,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                jh = commit_transaction->t_buffers;
 
                /* If we're in abort mode, we just un-journal the buffer and
-                  release it for background writing. */
+                  release it. */
 
                if (is_journal_aborted(journal)) {
+                       clear_buffer_jbddirty(jh2bh(jh));
                        JBUFFER_TRACE(jh, "journal is aborting: refile");
                        jbd2_journal_refile_buffer(journal, jh);
                        /* If that was the last one, we need to clean up
@@ -683,6 +684,8 @@ start_journal_io:
                printk(KERN_WARNING
                        "JBD2: Detected IO errors while flushing file data "
                       "on %s\n", journal->j_devname);
+               if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
+                       jbd2_journal_abort(journal, err);
                err = 0;
        }
 
@@ -783,6 +786,9 @@ wait_for_iobuf:
                /* AKPM: bforget here */
        }
 
+       if (err)
+               jbd2_journal_abort(journal, err);
+
        jbd_debug(3, "JBD: commit phase 5\n");
 
        if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
@@ -881,6 +887,8 @@ restart_loop:
                if (buffer_jbddirty(bh)) {
                        JBUFFER_TRACE(jh, "add to new checkpointing trans");
                        __jbd2_journal_insert_checkpoint(jh, commit_transaction);
+                       if (is_journal_aborted(journal))
+                               clear_buffer_jbddirty(bh);
                        JBUFFER_TRACE(jh, "refile for checkpoint writeback");
                        __jbd2_journal_refile_buffer(jh);
                        jbd_unlock_bh_state(bh);
index 01c3901c3a07599e592e9bd09282a3d6cd42cf24..783de118de9235583bda263db9c9ce665333c4c3 100644 (file)
@@ -1451,9 +1451,12 @@ recovery_error:
  *
  * Release a journal_t structure once it is no longer in use by the
  * journaled object.
+ * Return <0 if we couldn't clean up the journal.
  */
-void jbd2_journal_destroy(journal_t *journal)
+int jbd2_journal_destroy(journal_t *journal)
 {
+       int err = 0;
+
        /* Wait for the commit thread to wake up and die. */
        journal_kill_thread(journal);
 
@@ -1476,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal)
        J_ASSERT(journal->j_checkpoint_transactions == NULL);
        spin_unlock(&journal->j_list_lock);
 
-       /* We can now mark the journal as empty. */
-       journal->j_tail = 0;
-       journal->j_tail_sequence = ++journal->j_transaction_sequence;
        if (journal->j_sb_buffer) {
-               jbd2_journal_update_superblock(journal, 1);
+               if (!is_journal_aborted(journal)) {
+                       /* We can now mark the journal as empty. */
+                       journal->j_tail = 0;
+                       journal->j_tail_sequence =
+                               ++journal->j_transaction_sequence;
+                       jbd2_journal_update_superblock(journal, 1);
+               } else {
+                       err = -EIO;
+               }
                brelse(journal->j_sb_buffer);
        }
 
@@ -1492,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal)
                jbd2_journal_destroy_revoke(journal);
        kfree(journal->j_wbuf);
        kfree(journal);
+
+       return err;
 }
 
 
@@ -1717,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal)
        spin_lock(&journal->j_list_lock);
        while (!err && journal->j_checkpoint_transactions != NULL) {
                spin_unlock(&journal->j_list_lock);
+               mutex_lock(&journal->j_checkpoint_mutex);
                err = jbd2_log_do_checkpoint(journal);
+               mutex_unlock(&journal->j_checkpoint_mutex);
                spin_lock(&journal->j_list_lock);
        }
        spin_unlock(&journal->j_list_lock);
+
+       if (is_journal_aborted(journal))
+               return -EIO;
+
        jbd2_cleanup_journal_tail(journal);
 
        /* Finally, mark the journal as really needing no recovery.
@@ -1742,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal)
        J_ASSERT(journal->j_head == journal->j_tail);
        J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence);
        spin_unlock(&journal->j_state_lock);
-       return err;
+       return 0;
 }
 
 /**
index 058f50f65b766605eb7c88cb56e80f84156c05d8..73063285b13f7c7d868fb8553690bb2d0cbe5a03 100644 (file)
@@ -225,7 +225,7 @@ do {                                                                        \
  */
 int jbd2_journal_recover(journal_t *journal)
 {
-       int                     err;
+       int                     err, err2;
        journal_superblock_t *  sb;
 
        struct recovery_info    info;
@@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal)
        journal->j_transaction_sequence = ++info.end_transaction;
 
        jbd2_journal_clear_revoke(journal);
-       sync_blockdev(journal->j_fs_dev);
+       err2 = sync_blockdev(journal->j_fs_dev);
+       if (!err)
+               err = err2;
+
        return err;
 }
 
index 66c3499478b57579c8b4a719bdee3b8a2c7c42bb..d2e91ea998fd44b945cb8337cbc0934967b774f9 100644 (file)
@@ -967,6 +967,9 @@ struct journal_s
 #define JBD2_FLUSHED   0x008   /* The journal superblock has been flushed */
 #define JBD2_LOADED    0x010   /* The journal superblock has been loaded */
 #define JBD2_BARRIER   0x020   /* Use IDE barriers */
+#define JBD2_ABORT_ON_SYNCDATA_ERR     0x040   /* Abort the journal on file
+                                                * data write error in ordered
+                                                * mode */
 
 /*
  * Function declarations for the journaling transaction and buffer
@@ -1060,7 +1063,7 @@ extern void          jbd2_journal_clear_features
                   (journal_t *, unsigned long, unsigned long, unsigned long);
 extern int        jbd2_journal_create     (journal_t *);
 extern int        jbd2_journal_load       (journal_t *journal);
-extern void       jbd2_journal_destroy    (journal_t *);
+extern int        jbd2_journal_destroy    (journal_t *);
 extern int        jbd2_journal_recover    (journal_t *journal);
 extern int        jbd2_journal_wipe       (journal_t *, int);
 extern int        jbd2_journal_skip_recovery   (journal_t *);