Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Sep 2013 15:50:26 +0000 (08:50 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 5 Sep 2013 15:50:26 +0000 (08:50 -0700)
Pull vfs pile 1 from Al Viro:
 "Unfortunately, this merge window it'll have a be a lot of small piles -
  my fault, actually, for not keeping #for-next in anything that would
  resemble a sane shape ;-/

  This pile: assorted fixes (the first 3 are -stable fodder, IMO) and
  cleanups + %pd/%pD formats (dentry/file pathname, up to 4 last
  components) + several long-standing patches from various folks.

  There definitely will be a lot more (starting with Miklos'
  check_submount_and_drop() series)"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: (26 commits)
  direct-io: Handle O_(D)SYNC AIO
  direct-io: Implement generic deferred AIO completions
  add formats for dentry/file pathnames
  kvm eventfd: switch to fdget
  powerpc kvm: use fdget
  switch fchmod() to fdget
  switch epoll_ctl() to fdget
  switch copy_module_from_fd() to fdget
  simplify nilfs check for busy subtree
  ibmasmfs: don't bother passing superblock when not needed
  don't pass superblock to hypfs_{mkdir,create*}
  don't pass superblock to hypfs_diag_create_files
  don't pass superblock to hypfs_vm_create_files()
  oprofile: get rid of pointless forward declarations of struct super_block
  oprofilefs_create_...() do not need superblock argument
  oprofilefs_mkdir() doesn't need superblock argument
  don't bother with passing superblock to oprofile_create_stats_files()
  oprofile: don't bother with passing superblock to ->create_files()
  don't bother passing sb to oprofile_create_files()
  coh901318: don't open-code simple_read_from_buffer()
  ...

arch/powerpc/kvm/powerpc.c
fs/dcache.c
fs/ext4/ext4.h
fs/ext4/inode.c
fs/ext4/super.c
fs/namei.c
include/linux/fs.h
kernel/module.c

diff --combined arch/powerpc/kvm/powerpc.c
index f55e14cd1762192e2950689d5ee41e2e35698f2b,aca2e8f2e33fdb32fd32f3106ec6187fa3f78bed..07c0106fab76104048664da03a82c309e7187013
@@@ -117,6 -117,8 +117,6 @@@ int kvmppc_prepare_to_enter(struct kvm_
                        kvm_guest_exit();
                        continue;
                }
 -
 -              trace_hardirqs_on();
  #endif
  
                kvm_guest_enter();
@@@ -418,10 -420,6 +418,10 @@@ int kvm_arch_create_memslot(struct kvm_
        return kvmppc_core_create_memslot(slot, npages);
  }
  
 +void kvm_arch_memslots_updated(struct kvm *kvm)
 +{
 +}
 +
  int kvm_arch_prepare_memory_region(struct kvm *kvm,
                                   struct kvm_memory_slot *memslot,
                                   struct kvm_userspace_memory_region *mem,
@@@ -825,39 -823,39 +825,39 @@@ static int kvm_vcpu_ioctl_enable_cap(st
  #endif
  #ifdef CONFIG_KVM_MPIC
        case KVM_CAP_IRQ_MPIC: {
-               struct file *filp;
+               struct fd f;
                struct kvm_device *dev;
  
                r = -EBADF;
-               filp = fget(cap->args[0]);
-               if (!filp)
+               f = fdget(cap->args[0]);
+               if (!f.file)
                        break;
  
                r = -EPERM;
-               dev = kvm_device_from_filp(filp);
+               dev = kvm_device_from_filp(f.file);
                if (dev)
                        r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]);
  
-               fput(filp);
+               fdput(f);
                break;
        }
  #endif
  #ifdef CONFIG_KVM_XICS
        case KVM_CAP_IRQ_XICS: {
-               struct file *filp;
+               struct fd f;
                struct kvm_device *dev;
  
                r = -EBADF;
-               filp = fget(cap->args[0]);
-               if (!filp)
+               f = fdget(cap->args[0]);
+               if (!f.file)
                        break;
  
                r = -EPERM;
-               dev = kvm_device_from_filp(filp);
+               dev = kvm_device_from_filp(f.file);
                if (dev)
                        r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
  
-               fput(filp);
+               fdput(f);
                break;
        }
  #endif /* CONFIG_KVM_XICS */
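
The fget() to fdget() conversions in this hunk all follow the same pattern:
fdget() can skip the atomic reference-count bump when the file table is not
shared, and fdput() drops the reference only if one was actually taken. A
minimal sketch of the pattern, where do_work() stands in for the real
operation on the file:

	static long use_fd(unsigned int fd)
	{
		struct fd f = fdget(fd);
		long ret = -EBADF;

		if (!f.file)
			return ret;
		ret = do_work(f.file);	/* placeholder for the real work */
		fdput(f);		/* pairs with fdget(); no-op if no ref was taken */
		return ret;
	}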
diff --combined fs/dcache.c
index 96655f4f45749e530cee80a32c36fb8e6061164b,2e5f9ca8328593bcd0b9dd70b733b58d03d3f90d..5aa53bc056bada2af9c3c6040498abcb9a3a3856
@@@ -229,7 -229,7 +229,7 @@@ static void __d_free(struct rcu_head *h
   */
  static void d_free(struct dentry *dentry)
  {
 -      BUG_ON(dentry->d_count);
 +      BUG_ON(dentry->d_lockref.count);
        this_cpu_dec(nr_dentry);
        if (dentry->d_op && dentry->d_op->d_release)
                dentry->d_op->d_release(dentry);
@@@ -467,12 -467,12 +467,12 @@@ relock
        }
  
        if (ref)
 -              dentry->d_count--;
 +              dentry->d_lockref.count--;
        /*
         * inform the fs via d_prune that this dentry is about to be
         * unhashed and destroyed.
         */
-       if (dentry->d_flags & DCACHE_OP_PRUNE)
+       if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
                dentry->d_op->d_prune(dentry);
  
        dentry_lru_del(dentry);
@@@ -513,10 -513,15 +513,10 @@@ void dput(struct dentry *dentry
                return;
  
  repeat:
 -      if (dentry->d_count == 1)
 +      if (dentry->d_lockref.count == 1)
                might_sleep();
 -      spin_lock(&dentry->d_lock);
 -      BUG_ON(!dentry->d_count);
 -      if (dentry->d_count > 1) {
 -              dentry->d_count--;
 -              spin_unlock(&dentry->d_lock);
 +      if (lockref_put_or_lock(&dentry->d_lockref))
                return;
 -      }
  
        if (dentry->d_flags & DCACHE_OP_DELETE) {
                if (dentry->d_op->d_delete(dentry))
        dentry->d_flags |= DCACHE_REFERENCED;
        dentry_lru_add(dentry);
  
 -      dentry->d_count--;
 +      dentry->d_lockref.count--;
        spin_unlock(&dentry->d_lock);
        return;
  
@@@ -585,7 -590,7 +585,7 @@@ int d_invalidate(struct dentry * dentry
         * We also need to leave mountpoints alone,
         * directory or not.
         */
 -      if (dentry->d_count > 1 && dentry->d_inode) {
 +      if (dentry->d_lockref.count > 1 && dentry->d_inode) {
                if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) {
                        spin_unlock(&dentry->d_lock);
                        return -EBUSY;
@@@ -601,33 -606,20 +601,33 @@@ EXPORT_SYMBOL(d_invalidate)
  /* This must be called with d_lock held */
  static inline void __dget_dlock(struct dentry *dentry)
  {
 -      dentry->d_count++;
 +      dentry->d_lockref.count++;
  }
  
  static inline void __dget(struct dentry *dentry)
  {
 -      spin_lock(&dentry->d_lock);
 -      __dget_dlock(dentry);
 -      spin_unlock(&dentry->d_lock);
 +      lockref_get(&dentry->d_lockref);
  }
  
  struct dentry *dget_parent(struct dentry *dentry)
  {
 +      int gotref;
        struct dentry *ret;
  
 +      /*
 +       * Do optimistic parent lookup without any
 +       * locking.
 +       */
 +      rcu_read_lock();
 +      ret = ACCESS_ONCE(dentry->d_parent);
 +      gotref = lockref_get_not_zero(&ret->d_lockref);
 +      rcu_read_unlock();
 +      if (likely(gotref)) {
 +              if (likely(ret == ACCESS_ONCE(dentry->d_parent)))
 +                      return ret;
 +              dput(ret);
 +      }
 +
  repeat:
        /*
         * Don't need rcu_dereference because we re-check it was correct under
                goto repeat;
        }
        rcu_read_unlock();
 -      BUG_ON(!ret->d_count);
 -      ret->d_count++;
 +      BUG_ON(!ret->d_lockref.count);
 +      ret->d_lockref.count++;
        spin_unlock(&ret->d_lock);
        return ret;
  }
@@@ -726,7 -718,15 +726,15 @@@ restart
        spin_lock(&inode->i_lock);
        hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) {
                spin_lock(&dentry->d_lock);
 -              if (!dentry->d_count) {
 +              if (!dentry->d_lockref.count) {
+                       /*
+                        * inform the fs via d_prune that this dentry
+                        * is about to be unhashed and destroyed.
+                        */
+                       if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+                           !d_unhashed(dentry))
+                               dentry->d_op->d_prune(dentry);
                        __dget_dlock(dentry);
                        __d_drop(dentry);
                        spin_unlock(&dentry->d_lock);
@@@ -771,8 -771,12 +779,8 @@@ static void try_prune_one_dentry(struc
        /* Prune ancestors. */
        dentry = parent;
        while (dentry) {
 -              spin_lock(&dentry->d_lock);
 -              if (dentry->d_count > 1) {
 -                      dentry->d_count--;
 -                      spin_unlock(&dentry->d_lock);
 +              if (lockref_put_or_lock(&dentry->d_lockref))
                        return;
 -              }
                dentry = dentry_kill(dentry, 1);
        }
  }
@@@ -797,7 -801,7 +805,7 @@@ static void shrink_dentry_list(struct l
                 * the LRU because of laziness during lookup.  Do not free
                 * it - just keep it off the LRU list.
                 */
 -              if (dentry->d_count) {
 +              if (dentry->d_lockref.count) {
                        dentry_lru_del(dentry);
                        spin_unlock(&dentry->d_lock);
                        continue;
@@@ -911,13 -915,14 +919,14 @@@ static void shrink_dcache_for_umount_su
                         * inform the fs that this dentry is about to be
                         * unhashed and destroyed.
                         */
-                       if (dentry->d_flags & DCACHE_OP_PRUNE)
+                       if ((dentry->d_flags & DCACHE_OP_PRUNE) &&
+                           !d_unhashed(dentry))
                                dentry->d_op->d_prune(dentry);
  
                        dentry_lru_del(dentry);
                        __d_shrink(dentry);
  
 -                      if (dentry->d_count != 0) {
 +                      if (dentry->d_lockref.count != 0) {
                                printk(KERN_ERR
                                       "BUG: Dentry %p{i=%lx,n=%s}"
                                       " still in use (%d)"
                                       dentry->d_inode ?
                                       dentry->d_inode->i_ino : 0UL,
                                       dentry->d_name.name,
 -                                     dentry->d_count,
 +                                     dentry->d_lockref.count,
                                       dentry->d_sb->s_type->name,
                                       dentry->d_sb->s_id);
                                BUG();
                                list_del(&dentry->d_u.d_child);
                        } else {
                                parent = dentry->d_parent;
 -                              parent->d_count--;
 +                              parent->d_lockref.count--;
                                list_del(&dentry->d_u.d_child);
                        }
  
@@@ -985,7 -990,7 +994,7 @@@ void shrink_dcache_for_umount(struct su
  
        dentry = sb->s_root;
        sb->s_root = NULL;
 -      dentry->d_count--;
 +      dentry->d_lockref.count--;
        shrink_dcache_for_umount_subtree(dentry);
  
        while (!hlist_bl_empty(&sb->s_anon)) {
@@@ -1151,7 -1156,7 +1160,7 @@@ resume
                 * loop in shrink_dcache_parent() might not make any progress
                 * and loop forever.
                 */
 -              if (dentry->d_count) {
 +              if (dentry->d_lockref.count) {
                        dentry_lru_del(dentry);
                } else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
                        dentry_lru_move_list(dentry, dispose);
@@@ -1273,7 -1278,7 +1282,7 @@@ struct dentry *__d_alloc(struct super_b
        smp_wmb();
        dentry->d_name.name = dname;
  
 -      dentry->d_count = 1;
 +      dentry->d_lockref.count = 1;
        dentry->d_flags = 0;
        spin_lock_init(&dentry->d_lock);
        seqcount_init(&dentry->d_seq);
@@@ -1786,7 -1791,7 +1795,7 @@@ static noinline enum slow_d_compare slo
   * without taking d_lock and checking d_seq sequence count against @seq
   * returned here.
   *
 - * A refcount may be taken on the found dentry with the __d_rcu_to_refcount
 + * A refcount may be taken on the found dentry with the d_rcu_to_refcount
   * function.
   *
   * Alternatively, __d_lookup_rcu may be called again to look up the child of
@@@ -1974,7 -1979,7 +1983,7 @@@ struct dentry *__d_lookup(const struct 
                                goto next;
                }
  
 -              dentry->d_count++;
 +              dentry->d_lockref.count++;
                found = dentry;
                spin_unlock(&dentry->d_lock);
                break;
@@@ -2073,7 -2078,7 +2082,7 @@@ again
        spin_lock(&dentry->d_lock);
        inode = dentry->d_inode;
        isdir = S_ISDIR(inode->i_mode);
 -      if (dentry->d_count == 1) {
 +      if (dentry->d_lockref.count == 1) {
                if (!spin_trylock(&inode->i_lock)) {
                        spin_unlock(&dentry->d_lock);
                        cpu_relax();
@@@ -2952,7 -2957,7 +2961,7 @@@ resume
                }
                if (!(dentry->d_flags & DCACHE_GENOCIDE)) {
                        dentry->d_flags |= DCACHE_GENOCIDE;
 -                      dentry->d_count--;
 +                      dentry->d_lockref.count--;
                }
                spin_unlock(&dentry->d_lock);
        }
                struct dentry *child = this_parent;
                if (!(this_parent->d_flags & DCACHE_GENOCIDE)) {
                        this_parent->d_flags |= DCACHE_GENOCIDE;
 -                      this_parent->d_count--;
 +                      this_parent->d_lockref.count--;
                }
                this_parent = try_to_ascend(this_parent, locked, seq);
                if (!this_parent)
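
The d_count to d_lockref.count conversions throughout fs/dcache.c above are
the lockref change: the reference count is packed next to d_lock so that the
common get/put paths (lockref_get(), lockref_get_not_zero(),
lockref_put_or_lock()) can update the count with a cmpxchg on the combined
word instead of taking the spinlock. A sketch of the put side, with a
hypothetical refcounted object (cached_obj and free_obj() are illustrative):

	struct cached_obj {
		struct lockref ref;	/* spinlock + count, cmpxchg-able */
		/* ... payload ... */
	};

	static void cached_obj_put(struct cached_obj *obj)
	{
		/* fast path: lockless decrement while the count stays above 1 */
		if (lockref_put_or_lock(&obj->ref))
			return;
		/* slow path: ref.lock is now held and the count is 1 */
		obj->ref.count--;
		spin_unlock(&obj->ref.lock);
		free_obj(obj);		/* placeholder teardown */
	}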
diff --combined fs/ext4/ext4.h
index 06b488dca66643e8f42a24d32b30f0db064df150,b247fbbed99c546e448c71d0df509953712ba5e0..af815ea9d7cc4b6e209e28eea6f9bc97a6f247ce
@@@ -180,7 -180,6 +180,6 @@@ struct ext4_map_blocks 
   * Flags for ext4_io_end->flags
   */
  #define       EXT4_IO_END_UNWRITTEN   0x0001
- #define EXT4_IO_END_DIRECT    0x0002
  
  /*
   * For converting uninitialized extents on a work queue. 'handle' is used for
@@@ -196,8 -195,6 +195,6 @@@ typedef struct ext4_io_end 
        unsigned int            flag;           /* unwritten or not */
        loff_t                  offset;         /* offset in the file */
        ssize_t                 size;           /* size of the extent */
-       struct kiocb            *iocb;          /* iocb struct for AIO */
-       int                     result;         /* error value for AIO */
        atomic_t                count;          /* reference counter */
  } ext4_io_end_t;
  
@@@ -560,18 -557,6 +557,18 @@@ enum 
        /* Do not put hole in extent cache */
  #define EXT4_GET_BLOCKS_NO_PUT_HOLE           0x0200
  
 +/*
 + * The bit position of these flags must not overlap with any of the
 + * EXT4_GET_BLOCKS_*.  They are used by ext4_ext_find_extent(),
 + * read_extent_tree_block(), ext4_split_extent_at(),
 + * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf().
 + * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be
 + * caching the extents when reading from the extent tree while a
 + * truncate or punch hole operation is in progress.
 + */
 +#define EXT4_EX_NOCACHE                               0x0400
 +#define EXT4_EX_FORCE_CACHE                   0x0800
 +
  /*
   * Flags used by ext4_free_blocks
   */
  #define EXT4_FREE_BLOCKS_NO_QUOT_UPDATE       0x0008
  #define EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER 0x0010
  #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER  0x0020
 +#define EXT4_FREE_BLOCKS_RESERVE              0x0040
  
  /*
   * ioctl commands
  #define EXT4_IOC_MOVE_EXT             _IOWR('f', 15, struct move_extent)
  #define EXT4_IOC_RESIZE_FS            _IOW('f', 16, __u64)
  #define EXT4_IOC_SWAP_BOOT            _IO('f', 17)
 +#define EXT4_IOC_PRECACHE_EXTENTS     _IO('f', 18)
  
  #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
  /*
@@@ -914,11 -897,9 +911,9 @@@ struct ext4_inode_info 
         * Completed IOs that need unwritten extents handling and don't have
         * transaction reserved
         */
-       struct list_head i_unrsv_conversion_list;
        atomic_t i_ioend_count; /* Number of outstanding io_end structs */
        atomic_t i_unwritten; /* Nr. of inflight conversions pending */
        struct work_struct i_rsv_conversion_work;
-       struct work_struct i_unrsv_conversion_work;
  
        spinlock_t i_block_reservation_lock;
  
@@@ -1290,8 -1271,6 +1285,6 @@@ struct ext4_sb_info 
        struct flex_groups *s_flex_groups;
        ext4_group_t s_flex_groups_allocated;
  
-       /* workqueue for unreserved extent convertions (dio) */
-       struct workqueue_struct *unrsv_conversion_wq;
        /* workqueue for reserved extent conversions (buffered io) */
        struct workqueue_struct *rsv_conversion_wq;
  
@@@ -1354,9 -1333,6 +1347,6 @@@ static inline void ext4_set_io_unwritte
                                              struct ext4_io_end *io_end)
  {
        if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
-               /* Writeback has to have coversion transaction reserved */
-               WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
-                       !(io_end->flag & EXT4_IO_END_DIRECT));
                io_end->flag |= EXT4_IO_END_UNWRITTEN;
                atomic_inc(&EXT4_I(inode)->i_unwritten);
        }
@@@ -1389,7 -1365,6 +1379,7 @@@ enum 
                                           nolocking */
        EXT4_STATE_MAY_INLINE_DATA,     /* may have in-inode data */
        EXT4_STATE_ORDERED_MODE,        /* data=ordered mode */
 +      EXT4_STATE_EXT_PRECACHED,       /* extents have been precached */
  };
  
  #define EXT4_INODE_BIT_FNS(name, field, offset)                               \
@@@ -1930,7 -1905,7 +1920,7 @@@ extern ext4_group_t ext4_get_group_numb
  
  extern void ext4_validate_block_bitmap(struct super_block *sb,
                                       struct ext4_group_desc *desc,
 -                                     unsigned int block_group,
 +                                     ext4_group_t block_group,
                                       struct buffer_head *bh);
  extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
@@@ -2432,32 -2407,16 +2422,32 @@@ do {                                                         
  #define EXT4_FREECLUSTERS_WATERMARK 0
  #endif
  
 +/* Update i_disksize. Requires i_mutex to avoid races with truncate */
  static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
  {
 -      /*
 -       * XXX: replace with spinlock if seen contended -bzzz
 -       */
 +      WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
 +                   !mutex_is_locked(&inode->i_mutex));
        down_write(&EXT4_I(inode)->i_data_sem);
        if (newsize > EXT4_I(inode)->i_disksize)
                EXT4_I(inode)->i_disksize = newsize;
        up_write(&EXT4_I(inode)->i_data_sem);
 -      return ;
 +}
 +
 +/*
 + * Update i_disksize after writeback has been started. Races with truncate
 + * are avoided by checking i_size under i_data_sem.
 + */
 +static inline void ext4_wb_update_i_disksize(struct inode *inode, loff_t newsize)
 +{
 +      loff_t i_size;
 +
 +      down_write(&EXT4_I(inode)->i_data_sem);
 +      i_size = i_size_read(inode);
 +      if (newsize > i_size)
 +              newsize = i_size;
 +      if (newsize > EXT4_I(inode)->i_disksize)
 +              EXT4_I(inode)->i_disksize = newsize;
 +      up_write(&EXT4_I(inode)->i_data_sem);
  }
  
  struct ext4_group_info {
  
  #define EXT4_GROUP_INFO_NEED_INIT_BIT         0
  #define EXT4_GROUP_INFO_WAS_TRIMMED_BIT               1
 +#define EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT   2
 +#define EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT   3
  
  #define EXT4_MB_GRP_NEED_INIT(grp)    \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 +#define EXT4_MB_GRP_BBITMAP_CORRUPT(grp)      \
 +      (test_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
 +#define EXT4_MB_GRP_IBITMAP_CORRUPT(grp)      \
 +      (test_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &((grp)->bb_state)))
  
  #define EXT4_MB_GRP_WAS_TRIMMED(grp)  \
        (test_bit(EXT4_GROUP_INFO_WAS_TRIMMED_BIT, &((grp)->bb_state)))
@@@ -2692,12 -2645,6 +2682,12 @@@ extern int ext4_check_blockref(const ch
  struct ext4_ext_path;
  struct ext4_extent;
  
 +/*
 + * Maximum number of logical blocks in a file; ext4_extent's ee_block is
 + * __le32.
 + */
 +#define EXT_MAX_BLOCKS        0xffffffff
 +
  extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
  extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
  extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
@@@ -2727,8 -2674,7 +2717,8 @@@ extern int ext4_ext_insert_extent(handl
                                  struct ext4_ext_path *,
                                  struct ext4_extent *, int);
  extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t,
 -                                                struct ext4_ext_path *);
 +                                                struct ext4_ext_path *,
 +                                                int flags);
  extern void ext4_ext_drop_refs(struct ext4_ext_path *);
  extern int ext4_ext_check_inode(struct inode *inode);
  extern int ext4_find_delalloc_range(struct inode *inode,
  extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk);
  extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
 -
 +extern int ext4_ext_precache(struct inode *inode);
  
  /* move_extent.c */
  extern void ext4_double_down_write_data_sem(struct inode *first,
@@@ -2760,7 -2706,6 +2750,6 @@@ extern void ext4_put_io_end_defer(ext4_
  extern void ext4_io_submit_init(struct ext4_io_submit *io,
                                struct writeback_control *wbc);
  extern void ext4_end_io_rsv_work(struct work_struct *work);
- extern void ext4_end_io_unrsv_work(struct work_struct *work);
  extern void ext4_io_submit(struct ext4_io_submit *io);
  extern int ext4_bio_write_page(struct ext4_io_submit *io,
                               struct page *page,
diff --combined fs/ext4/inode.c
index 9115f28075157d0a7ce03071a8869f50a7e82fbd,123bd81692d16d8ac7facafd9988a950e15f8768..c79fd7dabe7953898f64b28101c8057fb7139119
@@@ -553,7 -553,7 +553,7 @@@ int ext4_map_blocks(handle_t *handle, s
        }
        if (retval > 0) {
                int ret;
 -              unsigned long long status;
 +              unsigned int status;
  
                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
@@@ -653,7 -653,7 +653,7 @@@ found
  
        if (retval > 0) {
                int ret;
 -              unsigned long long status;
 +              unsigned int status;
  
                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
@@@ -727,8 -727,12 +727,12 @@@ static int _ext4_get_block(struct inod
  
        ret = ext4_map_blocks(handle, inode, &map, flags);
        if (ret > 0) {
+               ext4_io_end_t *io_end = ext4_inode_aio(inode);
                map_bh(bh, inode->i_sb, map.m_pblk);
                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+               if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
+                       set_buffer_defer_completion(bh);
                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
                ret = 0;
        }
@@@ -969,8 -973,7 +973,8 @@@ retry_journal
                ext4_journal_stop(handle);
                goto retry_grab;
        }
 -      wait_on_page_writeback(page);
 +      /* In case writeback began while the page was unlocked */
 +      wait_for_stable_page(page);
  
        if (ext4_should_dioread_nolock(inode))
                ret = __block_write_begin(page, pos, len, ext4_get_block_write);
@@@ -1634,7 -1637,7 +1638,7 @@@ add_delayed
                set_buffer_delay(bh);
        } else if (retval > 0) {
                int ret;
 -              unsigned long long status;
 +              unsigned int status;
  
                if (unlikely(retval != map->m_len)) {
                        ext4_warning(inode->i_sb,
@@@ -1891,32 -1894,12 +1895,32 @@@ static int ext4_writepage(struct page *
        return ret;
  }
  
 +static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 +{
 +      int len;
 +      loff_t size = i_size_read(mpd->inode);
 +      int err;
 +
 +      BUG_ON(page->index != mpd->first_page);
 +      if (page->index == size >> PAGE_CACHE_SHIFT)
 +              len = size & ~PAGE_CACHE_MASK;
 +      else
 +              len = PAGE_CACHE_SIZE;
 +      clear_page_dirty_for_io(page);
 +      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
 +      if (!err)
 +              mpd->wbc->nr_to_write--;
 +      mpd->first_page++;
 +
 +      return err;
 +}
 +
  #define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
  
  /*
   * mballoc gives us at most this number of blocks...
   * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
 - * The rest of mballoc seems to handle chunks upto full group size.
 + * The rest of mballoc seems to handle chunks up to full group size.
   */
  #define MAX_WRITEPAGES_EXTENT_LEN 2048
  
   *
   * @mpd - extent of blocks
   * @lblk - logical number of the block in the file
 - * @b_state - b_state of the buffer head added
 + * @bh - buffer head we want to add to the extent
   *
 - * the function is used to collect contig. blocks in same state
 + * The function is used to collect contig. blocks in the same state. If the
 + * buffer doesn't require mapping for writeback and we haven't started the
 + * extent of buffers to map yet, the function returns 'true' immediately - the
 + * caller can write the buffer right away. Otherwise the function returns true
 + * if the block has been added to the extent, false if the block couldn't be
 + * added.
   */
 -static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
 -                                unsigned long b_state)
 +static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
 +                                 struct buffer_head *bh)
  {
        struct ext4_map_blocks *map = &mpd->map;
  
 -      /* Don't go larger than mballoc is willing to allocate */
 -      if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
 -              return 0;
 +      /* Buffer that doesn't need mapping for writeback? */
 +      if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
 +          (!buffer_delay(bh) && !buffer_unwritten(bh))) {
 +              /* So far no extent to map => we write the buffer right away */
 +              if (map->m_len == 0)
 +                      return true;
 +              return false;
 +      }
  
        /* First block in the extent? */
        if (map->m_len == 0) {
                map->m_lblk = lblk;
                map->m_len = 1;
 -              map->m_flags = b_state & BH_FLAGS;
 -              return 1;
 +              map->m_flags = bh->b_state & BH_FLAGS;
 +              return true;
        }
  
 +      /* Don't go larger than mballoc is willing to allocate */
 +      if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
 +              return false;
 +
        /* Can we merge the block to our big extent? */
        if (lblk == map->m_lblk + map->m_len &&
 -          (b_state & BH_FLAGS) == map->m_flags) {
 +          (bh->b_state & BH_FLAGS) == map->m_flags) {
                map->m_len++;
 -              return 1;
 +              return true;
        }
 -      return 0;
 +      return false;
  }
  
 -static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
 -                                  struct buffer_head *head,
 -                                  struct buffer_head *bh,
 -                                  ext4_lblk_t lblk)
 +/*
 + * mpage_process_page_bufs - submit page buffers for IO or add them to extent
 + *
 + * @mpd - extent of blocks for mapping
 + * @head - the first buffer in the page
 + * @bh - buffer we should start processing from
 + * @lblk - logical number of the block in the file corresponding to @bh
 + *
 + * Walk through page buffers from @bh upto @head (exclusive) and either submit
 + * the page for IO if all buffers in this page were mapped and there's no
 + * accumulated extent of buffers to map or add buffers in the page to the
 + * extent of buffers to map. The function returns 1 if the caller can continue
 + * by processing the next page, 0 if it should stop adding buffers to the
 + * extent to map because we cannot extend it anymore. It can also return value
 + * < 0 in case of error during IO submission.
 + */
 +static int mpage_process_page_bufs(struct mpage_da_data *mpd,
 +                                 struct buffer_head *head,
 +                                 struct buffer_head *bh,
 +                                 ext4_lblk_t lblk)
  {
        struct inode *inode = mpd->inode;
 +      int err;
        ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
                                                        >> inode->i_blkbits;
  
        do {
                BUG_ON(buffer_locked(bh));
  
 -              if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
 -                  (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
 -                  lblk >= blocks) {
 +              if (lblk >= blocks || !mpage_add_bh_to_extent(mpd, lblk, bh)) {
                        /* Found extent to map? */
                        if (mpd->map.m_len)
 -                              return false;
 -                      if (lblk >= blocks)
 -                              return true;
 -                      continue;
 +                              return 0;
 +                      /* Everything mapped so far and we hit EOF */
 +                      break;
                }
 -              if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
 -                      return false;
        } while (lblk++, (bh = bh->b_this_page) != head);
 -      return true;
 -}
 -
 -static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
 -{
 -      int len;
 -      loff_t size = i_size_read(mpd->inode);
 -      int err;
 -
 -      BUG_ON(page->index != mpd->first_page);
 -      if (page->index == size >> PAGE_CACHE_SHIFT)
 -              len = size & ~PAGE_CACHE_MASK;
 -      else
 -              len = PAGE_CACHE_SIZE;
 -      clear_page_dirty_for_io(page);
 -      err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
 -      if (!err)
 -              mpd->wbc->nr_to_write--;
 -      mpd->first_page++;
 -
 -      return err;
 +      /* So far everything mapped? Submit the page for IO. */
 +      if (mpd->map.m_len == 0) {
 +              err = mpage_submit_page(mpd, head->b_page);
 +              if (err < 0)
 +                      return err;
 +      }
 +      return lblk < blocks;
  }
  
  /*
@@@ -2036,6 -2007,8 +2040,6 @@@ static int mpage_map_and_submit_buffers
        struct inode *inode = mpd->inode;
        struct buffer_head *head, *bh;
        int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
 -      ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
 -                                                      >> inode->i_blkbits;
        pgoff_t start, end;
        ext4_lblk_t lblk;
        sector_t pblock;
  
                        if (page->index > end)
                                break;
 -                      /* Upto 'end' pages must be contiguous */
 +                      /* Up to 'end' pages must be contiguous */
                        BUG_ON(page->index != start);
                        bh = head = page_buffers(page);
                        do {
                                         */
                                        mpd->map.m_len = 0;
                                        mpd->map.m_flags = 0;
 -                                      add_page_bufs_to_extent(mpd, head, bh,
 -                                                              lblk);
 +                                      /*
 +                                       * FIXME: If dioread_nolock supports
 +                                       * blocksize < pagesize, we need to make
 +                                       * sure we add size mapped so far to
 +                                       * io_end->size as the following call
 +                                       * can submit the page for IO.
 +                                       */
 +                                      err = mpage_process_page_bufs(mpd, head,
 +                                                                    bh, lblk);
                                        pagevec_release(&pvec);
 -                                      return 0;
 +                                      if (err > 0)
 +                                              err = 0;
 +                                      return err;
                                }
                                if (buffer_delay(bh)) {
                                        clear_buffer_delay(bh);
                                        bh->b_blocknr = pblock++;
                                }
                                clear_buffer_unwritten(bh);
 -                      } while (++lblk < blocks &&
 -                               (bh = bh->b_this_page) != head);
 +                      } while (lblk++, (bh = bh->b_this_page) != head);
  
                        /*
                         * FIXME: This is going to break if dioread_nolock
@@@ -2238,10 -2203,12 +2242,10 @@@ static int mpage_map_and_submit_extent(
  
        /* Update on-disk size after IO is submitted */
        disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
 -      if (disksize > i_size_read(inode))
 -              disksize = i_size_read(inode);
        if (disksize > EXT4_I(inode)->i_disksize) {
                int err2;
  
 -              ext4_update_i_disksize(inode, disksize);
 +              ext4_wb_update_i_disksize(inode, disksize);
                err2 = ext4_mark_inode_dirty(handle, inode);
                if (err2)
                        ext4_error(inode->i_sb,
  /*
   * Calculate the total number of credits to reserve for one writepages
   * iteration. This is called from ext4_writepages(). We map an extent of
 - * upto MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
 + * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
   * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
   * bpp - 1 blocks in bpp different extents.
   */
@@@ -2356,10 -2323,14 +2360,10 @@@ static int mpage_prepare_extent_to_map(
                        lblk = ((ext4_lblk_t)page->index) <<
                                (PAGE_CACHE_SHIFT - blkbits);
                        head = page_buffers(page);
 -                      if (!add_page_bufs_to_extent(mpd, head, head, lblk))
 +                      err = mpage_process_page_bufs(mpd, head, head, lblk);
 +                      if (err <= 0)
                                goto out;
 -                      /* So far everything mapped? Submit the page for IO. */
 -                      if (mpd->map.m_len == 0) {
 -                              err = mpage_submit_page(mpd, page);
 -                              if (err < 0)
 -                                      goto out;
 -                      }
 +                      err = 0;
  
                        /*
                         * Accumulated enough dirty pages? This doesn't apply
@@@ -2443,7 -2414,7 +2447,7 @@@ static int ext4_writepages(struct addre
  
        if (ext4_should_dioread_nolock(inode)) {
                /*
 -               * We may need to convert upto one extent per block in
 +               * We may need to convert up to one extent per block in
                 * the page and we may dirty the inode.
                 */
                rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
@@@ -2679,7 -2650,7 +2683,7 @@@ retry_journal
                goto retry_grab;
        }
        /* In case writeback began while the page was unlocked */
 -      wait_on_page_writeback(page);
 +      wait_for_stable_page(page);
  
        ret = __block_write_begin(page, pos, len, ext4_da_get_block_prep);
        if (ret < 0) {
@@@ -3024,19 -2995,13 +3028,13 @@@ static int ext4_get_block_write_nolock(
  }
  
  static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
-                           ssize_t size, void *private, int ret,
-                           bool is_async)
+                           ssize_t size, void *private)
  {
-       struct inode *inode = file_inode(iocb->ki_filp);
          ext4_io_end_t *io_end = iocb->private;
  
        /* if not async direct IO just return */
-       if (!io_end) {
-               inode_dio_done(inode);
-               if (is_async)
-                       aio_complete(iocb, ret, 0);
+       if (!io_end)
                return;
-       }
  
        ext_debug("ext4_end_io_dio(): io_end 0x%p "
                  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
        iocb->private = NULL;
        io_end->offset = offset;
        io_end->size = size;
-       if (is_async) {
-               io_end->iocb = iocb;
-               io_end->result = ret;
-       }
-       ext4_put_io_end_defer(io_end);
+       ext4_put_io_end(io_end);
  }
  
  /*
@@@ -3135,7 -3096,6 +3129,6 @@@ static ssize_t ext4_ext_direct_IO(int r
                        ret = -ENOMEM;
                        goto retake_lock;
                }
-               io_end->flag |= EXT4_IO_END_DIRECT;
                /*
                 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
                 */
                if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
                        WARN_ON(iocb->private != io_end);
                        WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
-                       WARN_ON(io_end->iocb);
-                       /*
-                        * Generic code already did inode_dio_done() so we
-                        * have to clear EXT4_IO_END_DIRECT to not do it for
-                        * the second time.
-                        */
-                       io_end->flag = 0;
                        ext4_put_io_end(io_end);
                        iocb->private = NULL;
                }
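
With the deferred AIO completion series, the direct-io ->end_io callback
loses its is_async and ret arguments, as the hunks above show: the
filesystem only records what needs converting, and the direct-io core itself
completes the iocb and calls inode_dio_done(), from a workqueue when the
mapping flagged the buffer with set_buffer_defer_completion(). A sketch of
the resulting callback shape (the myfs_* names are placeholders, modelled on
the new ext4_end_io_dio() above):

	static void myfs_end_io_dio(struct kiocb *iocb, loff_t offset,
				    ssize_t size, void *private)
	{
		struct myfs_io_end *io_end = iocb->private;

		if (!io_end)		/* not async direct IO */
			return;
		iocb->private = NULL;
		io_end->offset = offset;
		io_end->size = size;
		myfs_put_io_end(io_end);	/* conversion may run from a workqueue */
	}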
@@@ -4599,9 -4552,7 +4585,9 @@@ int ext4_setattr(struct dentry *dentry
                ext4_journal_stop(handle);
        }
  
 -      if (attr->ia_valid & ATTR_SIZE) {
 +      if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 +              handle_t *handle;
 +              loff_t oldsize = inode->i_size;
  
                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                        if (attr->ia_size > sbi->s_bitmap_maxbytes)
                                return -EFBIG;
                }
 -      }
 -
 -      if (S_ISREG(inode->i_mode) &&
 -          attr->ia_valid & ATTR_SIZE &&
 -          (attr->ia_size < inode->i_size)) {
 -              handle_t *handle;
 -
 -              handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 -              if (IS_ERR(handle)) {
 -                      error = PTR_ERR(handle);
 -                      goto err_out;
 -              }
 -              if (ext4_handle_valid(handle)) {
 -                      error = ext4_orphan_add(handle, inode);
 -                      orphan = 1;
 -              }
 -              EXT4_I(inode)->i_disksize = attr->ia_size;
 -              rc = ext4_mark_inode_dirty(handle, inode);
 -              if (!error)
 -                      error = rc;
 -              ext4_journal_stop(handle);
 -
 -              if (ext4_should_order_data(inode)) {
 -                      error = ext4_begin_ordered_truncate(inode,
 +              if (S_ISREG(inode->i_mode) &&
 +                  (attr->ia_size < inode->i_size)) {
 +                      if (ext4_should_order_data(inode)) {
 +                              error = ext4_begin_ordered_truncate(inode,
                                                            attr->ia_size);
 -                      if (error) {
 -                              /* Do as much error cleanup as possible */
 -                              handle = ext4_journal_start(inode,
 -                                                          EXT4_HT_INODE, 3);
 -                              if (IS_ERR(handle)) {
 -                                      ext4_orphan_del(NULL, inode);
 +                              if (error)
                                        goto err_out;
 -                              }
 -                              ext4_orphan_del(handle, inode);
 -                              orphan = 0;
 -                              ext4_journal_stop(handle);
 +                      }
 +                      handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 +                      if (IS_ERR(handle)) {
 +                              error = PTR_ERR(handle);
                                goto err_out;
                        }
 -              }
 -      }
 -
 -      if (attr->ia_valid & ATTR_SIZE) {
 -              if (attr->ia_size != inode->i_size) {
 -                      loff_t oldsize = inode->i_size;
 -
 -                      i_size_write(inode, attr->ia_size);
 -                      /*
 -                       * Blocks are going to be removed from the inode. Wait
 -                       * for dio in flight.  Temporarily disable
 -                       * dioread_nolock to prevent livelock.
 -                       */
 -                      if (orphan) {
 -                              if (!ext4_should_journal_data(inode)) {
 -                                      ext4_inode_block_unlocked_dio(inode);
 -                                      inode_dio_wait(inode);
 -                                      ext4_inode_resume_unlocked_dio(inode);
 -                              } else
 -                                      ext4_wait_for_tail_page_commit(inode);
 +                      if (ext4_handle_valid(handle)) {
 +                              error = ext4_orphan_add(handle, inode);
 +                              orphan = 1;
                        }
 +                      down_write(&EXT4_I(inode)->i_data_sem);
 +                      EXT4_I(inode)->i_disksize = attr->ia_size;
 +                      rc = ext4_mark_inode_dirty(handle, inode);
 +                      if (!error)
 +                              error = rc;
                        /*
 -                       * Truncate pagecache after we've waited for commit
 -                       * in data=journal mode to make pages freeable.
 +                       * We have to update i_size under i_data_sem together
 +                       * with i_disksize to avoid races with writeback code
 +                       * running ext4_wb_update_i_disksize().
                         */
 -                      truncate_pagecache(inode, oldsize, inode->i_size);
 +                      if (!error)
 +                              i_size_write(inode, attr->ia_size);
 +                      up_write(&EXT4_I(inode)->i_data_sem);
 +                      ext4_journal_stop(handle);
 +                      if (error) {
 +                              ext4_orphan_del(NULL, inode);
 +                              goto err_out;
 +                      }
 +              } else
 +                      i_size_write(inode, attr->ia_size);
 +
 +              /*
 +               * Blocks are going to be removed from the inode. Wait
 +               * for dio in flight.  Temporarily disable
 +               * dioread_nolock to prevent livelock.
 +               */
 +              if (orphan) {
 +                      if (!ext4_should_journal_data(inode)) {
 +                              ext4_inode_block_unlocked_dio(inode);
 +                              inode_dio_wait(inode);
 +                              ext4_inode_resume_unlocked_dio(inode);
 +                      } else
 +                              ext4_wait_for_tail_page_commit(inode);
                }
 -              ext4_truncate(inode);
 +              /*
 +               * Truncate pagecache after we've waited for commit
 +               * in data=journal mode to make pages freeable.
 +               */
 +              truncate_pagecache(inode, oldsize, inode->i_size);
        }
 +      /*
 +       * We want to call ext4_truncate() even if attr->ia_size ==
 +       * inode->i_size for cases like truncation of fallocated space
 +       */
 +      if (attr->ia_valid & ATTR_SIZE)
 +              ext4_truncate(inode);
  
        if (!rc) {
                setattr_copy(inode, attr);
diff --combined fs/ext4/super.c
index 42337141e79fb90885678d9ea6a653d520437ffa,5db4f0df81741e981ab8abf5e2f1fc91c0b22f1f..049c8a8bdc0eab2e7dbdfdcf67a45fd43b31a95b
@@@ -762,9 -762,7 +762,7 @@@ static void ext4_put_super(struct super
        ext4_unregister_li_request(sb);
        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
  
-       flush_workqueue(sbi->unrsv_conversion_wq);
        flush_workqueue(sbi->rsv_conversion_wq);
-       destroy_workqueue(sbi->unrsv_conversion_wq);
        destroy_workqueue(sbi->rsv_conversion_wq);
  
        if (sbi->s_journal) {
@@@ -875,14 -873,12 +873,12 @@@ static struct inode *ext4_alloc_inode(s
  #endif
        ei->jinode = NULL;
        INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
-       INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
        spin_lock_init(&ei->i_completed_io_lock);
        ei->i_sync_tid = 0;
        ei->i_datasync_tid = 0;
        atomic_set(&ei->i_ioend_count, 0);
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
-       INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
  
        return &ei->vfs_inode;
  }
@@@ -1134,8 -1130,8 +1130,8 @@@ enum 
        Opt_nouid32, Opt_debug, Opt_removed,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
        Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
 -      Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
 -      Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
 +      Opt_commit, Opt_min_batch_time, Opt_max_batch_time, Opt_journal_dev,
 +      Opt_journal_path, Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
        Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@@ -1179,7 -1175,6 +1175,7 @@@ static const match_table_t tokens = 
        {Opt_min_batch_time, "min_batch_time=%u"},
        {Opt_max_batch_time, "max_batch_time=%u"},
        {Opt_journal_dev, "journal_dev=%u"},
 +      {Opt_journal_path, "journal_path=%s"},
        {Opt_journal_checksum, "journal_checksum"},
        {Opt_journal_async_commit, "journal_async_commit"},
        {Opt_abort, "abort"},
@@@ -1339,7 -1334,6 +1335,7 @@@ static int clear_qf_name(struct super_b
  #define MOPT_NO_EXT2  0x0100
  #define MOPT_NO_EXT3  0x0200
  #define MOPT_EXT4_ONLY        (MOPT_NO_EXT2 | MOPT_NO_EXT3)
 +#define MOPT_STRING   0x0400
  
  static const struct mount_opts {
        int     token;
        {Opt_resuid, 0, MOPT_GTE0},
        {Opt_resgid, 0, MOPT_GTE0},
        {Opt_journal_dev, 0, MOPT_GTE0},
 +      {Opt_journal_path, 0, MOPT_STRING},
        {Opt_journal_ioprio, 0, MOPT_GTE0},
        {Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
        {Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_NO_EXT2 | MOPT_DATAJ},
@@@ -1483,7 -1476,7 +1479,7 @@@ static int handle_mount_opt(struct supe
                return -1;
        }
  
 -      if (args->from && match_int(args, &arg))
 +      if (args->from && !(m->flags & MOPT_STRING) && match_int(args, &arg))
                return -1;
        if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
                return -1;
                        return -1;
                }
                *journal_devnum = arg;
 +      } else if (token == Opt_journal_path) {
 +              char *journal_path;
 +              struct inode *journal_inode;
 +              struct path path;
 +              int error;
 +
 +              if (is_remount) {
 +                      ext4_msg(sb, KERN_ERR,
 +                               "Cannot specify journal on remount");
 +                      return -1;
 +              }
 +              journal_path = match_strdup(&args[0]);
 +              if (!journal_path) {
 +                      ext4_msg(sb, KERN_ERR, "error: could not dup "
 +                              "journal device string");
 +                      return -1;
 +              }
 +
 +              error = kern_path(journal_path, LOOKUP_FOLLOW, &path);
 +              if (error) {
 +                      ext4_msg(sb, KERN_ERR, "error: could not find "
 +                              "journal device path: error %d", error);
 +                      kfree(journal_path);
 +                      return -1;
 +              }
 +
 +              journal_inode = path.dentry->d_inode;
 +              if (!S_ISBLK(journal_inode->i_mode)) {
 +                      ext4_msg(sb, KERN_ERR, "error: journal path %s "
 +                              "is not a block device", journal_path);
 +                      path_put(&path);
 +                      kfree(journal_path);
 +                      return -1;
 +              }
 +
 +              *journal_devnum = new_encode_dev(journal_inode->i_rdev);
 +              path_put(&path);
 +              kfree(journal_path);
        } else if (token == Opt_journal_ioprio) {
                if (arg > 7) {
                        ext4_msg(sb, KERN_ERR, "Invalid journal IO priority"
@@@ -3995,14 -3950,6 +3991,6 @@@ no_journal
                goto failed_mount4;
        }
  
-       EXT4_SB(sb)->unrsv_conversion_wq =
-               alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
-       if (!EXT4_SB(sb)->unrsv_conversion_wq) {
-               printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
-               ret = -ENOMEM;
-               goto failed_mount4;
-       }
        /*
         * The jbd2_journal_load will have done any necessary log recovery,
         * so we can safely mount the rest of the filesystem now.
@@@ -4156,8 -4103,6 +4144,6 @@@ failed_mount4
        ext4_msg(sb, KERN_ERR, "mount failed");
        if (EXT4_SB(sb)->rsv_conversion_wq)
                destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
-       if (EXT4_SB(sb)->unrsv_conversion_wq)
-               destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
  failed_mount_wq:
        if (sbi->s_journal) {
                jbd2_journal_destroy(sbi->s_journal);
@@@ -4605,7 -4550,6 +4591,6 @@@ static int ext4_sync_fs(struct super_bl
  
        trace_ext4_sync_fs(sb, wait);
        flush_workqueue(sbi->rsv_conversion_wq);
-       flush_workqueue(sbi->unrsv_conversion_wq);
        /*
         * Writeback quota in non-journalled quota case - journalled quota has
         * no dirty dquots
@@@ -4641,7 -4585,6 +4626,6 @@@ static int ext4_sync_fs_nojournal(struc
  
        trace_ext4_sync_fs(sb, wait);
        flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
-       flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
        dquot_writeback_dquots(sb, -1);
        if (wait && test_opt(sb, BARRIER))
                ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
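
The journal_path= option added above resolves a path with kern_path(),
checks that it names a block device, and encodes its dev_t into the same
journal_devnum that journal_dev= fills in, so an external journal can be
named by a stable path (for instance a udev-created symlink) rather than by
raw device number. Usage would be along the lines of
"mount -o journal_path=/dev/disk/by-id/some-journal-device ..."; the path
here is illustrative.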
diff --combined fs/namei.c
index 2c30c84d4ea1da5e17381caeea6de5ae27e47a0e,b234e4ec0a7164615a2827f4f4a15952d2431aa4..f415c6683a837ac3cb1c5bf1d9600b6a3aec9d7e
@@@ -494,50 -494,6 +494,50 @@@ static inline void unlock_rcu_walk(void
        br_read_unlock(&vfsmount_lock);
  }
  
 +/*
 + * When we move over from the RCU domain to properly refcounted
 + * long-lived dentries, we need to check the sequence numbers
 + * we got before lookup very carefully.
 + *
 + * We cannot blindly increment a dentry refcount - even if it
 + * is not locked - if it is zero, because it may have gone
 + * through the final d_kill() logic already.
 + *
 + * So for a zero refcount, we need to get the spinlock (which is
 + * safe even for a dead dentry because the de-allocation is
 + * RCU-delayed), and check the sequence count under the lock.
 + *
 + * Once we have checked the sequence count, we know it is live,
 + * and since we hold the spinlock it cannot die from under us.
 + *
 + * In contrast, if the reference count wasn't zero, we can just
 + * increment the lockref without having to take the spinlock.
 + * Even if the sequence number ends up being stale, we haven't
 + * gone through the final dput() and killed the dentry yet.
 + */
 +static inline int d_rcu_to_refcount(struct dentry *dentry, seqcount_t *validate, unsigned seq)
 +{
 +      int gotref;
 +
 +      gotref = lockref_get_or_lock(&dentry->d_lockref);
 +
 +      /* Does the sequence number still match? */
 +      if (read_seqcount_retry(validate, seq)) {
 +              if (gotref)
 +                      dput(dentry);
 +              else
 +                      spin_unlock(&dentry->d_lock);
 +              return -ECHILD;
 +      }
 +
 +      /* Get the ref now, if we couldn't get it originally */
 +      if (!gotref) {
 +              dentry->d_lockref.count++;
 +              spin_unlock(&dentry->d_lock);
 +      }
 +      return 0;
 +}
 +
  /**
   * unlazy_walk - try to switch to ref-walk mode.
   * @nd: nameidata pathwalk data
@@@ -562,28 -518,29 +562,28 @@@ static int unlazy_walk(struct nameidat
                                nd->root.dentry != fs->root.dentry)
                        goto err_root;
        }
 -      spin_lock(&parent->d_lock);
 +
 +      /*
 +       * For a negative lookup, the lookup sequence point is the parents
 +       * sequence point, and it only needs to revalidate the parent dentry.
 +       *
 +       * For a positive lookup, we need to move both the parent and the
 +       * dentry from the RCU domain to be properly refcounted. And the
 +       * sequence number in the dentry validates *both* dentry counters,
 +       * since we checked the sequence number of the parent after we got
 +       * the child sequence number. So we know the parent must still
 +       * be valid if the child sequence number is still valid.
 +       */
        if (!dentry) {
 -              if (!__d_rcu_to_refcount(parent, nd->seq))
 -                      goto err_parent;
 +              if (d_rcu_to_refcount(parent, &parent->d_seq, nd->seq) < 0)
 +                      goto err_root;
                BUG_ON(nd->inode != parent->d_inode);
        } else {
 -              if (dentry->d_parent != parent)
 +              if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0)
 +                      goto err_root;
 +              if (d_rcu_to_refcount(parent, &dentry->d_seq, nd->seq) < 0)
                        goto err_parent;
 -              spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
 -              if (!__d_rcu_to_refcount(dentry, nd->seq))
 -                      goto err_child;
 -              /*
 -               * If the sequence check on the child dentry passed, then
 -               * the child has not been removed from its parent. This
 -               * means the parent dentry must be valid and able to take
 -               * a reference at this point.
 -               */
 -              BUG_ON(!IS_ROOT(dentry) && dentry->d_parent != parent);
 -              BUG_ON(!parent->d_count);
 -              parent->d_count++;
 -              spin_unlock(&dentry->d_lock);
        }
 -      spin_unlock(&parent->d_lock);
        if (want_root) {
                path_get(&nd->root);
                spin_unlock(&fs->lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
  
 -err_child:
 -      spin_unlock(&dentry->d_lock);
  err_parent:
 -      spin_unlock(&parent->d_lock);
 +      dput(dentry);
  err_root:
        if (want_root)
                spin_unlock(&fs->lock);
@@@ -626,11 -585,14 +626,11 @@@ static int complete_walk(struct nameida
                nd->flags &= ~LOOKUP_RCU;
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;
 -              spin_lock(&dentry->d_lock);
 -              if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
 -                      spin_unlock(&dentry->d_lock);
 +
 +              if (d_rcu_to_refcount(dentry, &dentry->d_seq, nd->seq) < 0) {
                        unlock_rcu_walk();
                        return -ECHILD;
                }
 -              BUG_ON(nd->inode != dentry->d_inode);
 -              spin_unlock(&dentry->d_lock);
                mntget(nd->path.mnt);
                unlock_rcu_walk();
        }
@@@ -2222,6 -2184,188 +2222,188 @@@ user_path_parent(int dfd, const char __
        return s;
  }
  
+ /**
+  * umount_lookup_last - look up last component for umount
+  * @nd:   pathwalk nameidata - currently pointing at parent directory of "last"
+  * @path: pointer to container for result
+  *
+  * This is a special lookup_last function just for umount. In this case, we
+  * need to resolve the path without doing any revalidation.
+  *
+  * The nameidata should be the result of doing a LOOKUP_PARENT pathwalk. Since
+  * mountpoints are always pinned in the dcache, their ancestors are too. Thus,
+  * in almost all cases, this lookup will be served out of the dcache. The only
+  * cases where it won't are if nd->last refers to a symlink or the path is
+  * bogus and it doesn't exist.
+  *
+  * Returns:
+  * -error: if there was an error during lookup. This includes -ENOENT if the
+  *         lookup found a negative dentry. The nd->path reference will also be
+  *         put in this case.
+  *
+  * 0:      if we successfully resolved nd->path and found it not to be a
+  *         symlink that needs to be followed. "path" will also be populated.
+  *         The nd->path reference will also be put.
+  *
+  * 1:      if we successfully resolved nd->last and found it to be a symlink
+  *         that needs to be followed. "path" will be populated with the path
+  *         to the link, and nd->path will *not* be put.
+  */
+ static int
+ umount_lookup_last(struct nameidata *nd, struct path *path)
+ {
+       int error = 0;
+       struct dentry *dentry;
+       struct dentry *dir = nd->path.dentry;
+
+       if (unlikely(nd->flags & LOOKUP_RCU)) {
+               WARN_ON_ONCE(1);
+               error = -ECHILD;
+               goto error_check;
+       }
+       nd->flags &= ~LOOKUP_PARENT;
+       if (unlikely(nd->last_type != LAST_NORM)) {
+               error = handle_dots(nd, nd->last_type);
+               if (!error)
+                       dentry = dget(nd->path.dentry);
+               goto error_check;
+       }
+       mutex_lock(&dir->d_inode->i_mutex);
+       dentry = d_lookup(dir, &nd->last);
+       if (!dentry) {
+               /*
+                * No cached dentry. Mounted dentries are pinned in the cache,
+                * so this dentry is probably a symlink or the path doesn't
+                * actually point to a mounted dentry.
+                */
+               dentry = d_alloc(dir, &nd->last);
+               if (!dentry) {
+                       error = -ENOMEM;
+               } else {
+                       dentry = lookup_real(dir->d_inode, dentry, nd->flags);
+                       if (IS_ERR(dentry))
+                               error = PTR_ERR(dentry);
+               }
+       }
+       mutex_unlock(&dir->d_inode->i_mutex);
+ error_check:
+       if (!error) {
+               if (!dentry->d_inode) {
+                       error = -ENOENT;
+                       dput(dentry);
+               } else {
+                       path->dentry = dentry;
+                       path->mnt = mntget(nd->path.mnt);
+                       if (should_follow_link(dentry->d_inode,
+                                               nd->flags & LOOKUP_FOLLOW))
+                               return 1;
+                       follow_mount(path);
+               }
+       }
+       terminate_walk(nd);
+       return error;
+ }
+
+ /**
+  * path_umountat - look up a path to be umounted
+  * @dfd:      directory file descriptor to start walk from
+  * @name:     full pathname to walk
+  * @path:     pointer to container for result
+  * @flags:    lookup flags
+  *
+  * Look up the given name, but don't attempt to revalidate the last component.
+  * Returns 0 and "path" will be valid on success; returns an error otherwise.
+  */
+ static int
+ path_umountat(int dfd, const char *name, struct path *path, unsigned int flags)
+ {
+       struct file *base = NULL;
+       struct nameidata nd;
+       int err;
+
+       err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base);
+       if (unlikely(err))
+               return err;
+       current->total_link_count = 0;
+       err = link_path_walk(name, &nd);
+       if (err)
+               goto out;
+       /* If we're in rcuwalk, drop out of it to handle last component */
+       if (nd.flags & LOOKUP_RCU) {
+               err = unlazy_walk(&nd, NULL);
+               if (err) {
+                       terminate_walk(&nd);
+                       goto out;
+               }
+       }
+       err = umount_lookup_last(&nd, path);
+       while (err > 0) {
+               void *cookie;
+               struct path link = *path;
+               err = may_follow_link(&link, &nd);
+               if (unlikely(err))
+                       break;
+               nd.flags |= LOOKUP_PARENT;
+               err = follow_link(&link, &nd, &cookie);
+               if (err)
+                       break;
+               err = umount_lookup_last(&nd, path);
+               put_link(&nd, &link, cookie);
+       }
+ out:
+       if (base)
+               fput(base);
+       if (nd.root.mnt && !(nd.flags & LOOKUP_ROOT))
+               path_put(&nd.root);
+       return err;
+ }
+
+ /**
+  * user_path_umountat - lookup a path from userland in order to umount it
+  * @dfd:      directory file descriptor
+  * @name:     pathname from userland
+  * @flags:    lookup flags
+  * @path:     pointer to container to hold result
+  *
+  * A umount is a special case for path walking. We're not actually interested
+  * in the inode in this situation, and ESTALE errors can be a problem. We
+  * simply want to track down the dentry and vfsmount attached at the mountpoint
+  * and avoid revalidating the last component.
+  *
+  * Returns 0 and populates "path" on success.
+  */
+ int
+ user_path_umountat(int dfd, const char __user *name, unsigned int flags,
+                       struct path *path)
+ {
+       struct filename *s = getname(name);
+       int error;
+
+       if (IS_ERR(s))
+               return PTR_ERR(s);
+       error = path_umountat(dfd, s->name, path, flags | LOOKUP_RCU);
+       if (unlikely(error == -ECHILD))
+               error = path_umountat(dfd, s->name, path, flags);
+       if (unlikely(error == -ESTALE))
+               error = path_umountat(dfd, s->name, path, flags | LOOKUP_REVAL);
+       if (likely(!error))
+               audit_inode(s, path->dentry, 0);
+       putname(s);
+       return error;
+ }
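For context, the retry ladder above mirrors user_path_at(): RCU-walk first, plain ref-walk on -ECHILD, then LOOKUP_REVAL on -ESTALE. The intended consumer is the umount(2) syscall in fs/namespace.c; a rough sketch of the converted call site (condensed -- the actual hunk in this series also carries permission and flag validation that is elided here):

SYSCALL_DEFINE2(umount, char __user *, name, int, flags)
{
	struct path path;
	int lookup_flags = 0;
	int retval;

	if (!(flags & UMOUNT_NOFOLLOW))
		lookup_flags |= LOOKUP_FOLLOW;

	retval = user_path_umountat(AT_FDCWD, name, lookup_flags, &path);
	if (retval)
		return retval;
	/* ... proceed to do_umount() on real_mount(path.mnt) ... */
	return retval;
}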
  /*
   * It's inline, so penalty for filesystems that don't use sticky bit is
   * minimal.
@@@ -3365,7 -3509,7 +3547,7 @@@ void dentry_unhash(struct dentry *dentr
  {
        shrink_dcache_parent(dentry);
        spin_lock(&dentry->d_lock);
 -      if (dentry->d_count == 1)
 +      if (dentry->d_lockref.count == 1)
                __d_drop(dentry);
        spin_unlock(&dentry->d_lock);
  }
@@@ -3709,15 -3853,11 +3891,15 @@@ SYSCALL_DEFINE5(linkat, int, olddfd, co
        if ((flags & ~(AT_SYMLINK_FOLLOW | AT_EMPTY_PATH)) != 0)
                return -EINVAL;
        /*
 -       * Using empty names is equivalent to using AT_SYMLINK_FOLLOW
 -       * on /proc/self/fd/<fd>.
 +       * To use empty names we require CAP_DAC_READ_SEARCH.
 +       * This ensures that not everyone will be able to create
 +       * a hardlink using the passed file descriptor.
         */
 -      if (flags & AT_EMPTY_PATH)
 +      if (flags & AT_EMPTY_PATH) {
 +              if (!capable(CAP_DAC_READ_SEARCH))
 +                      return -ENOENT;
                how = LOOKUP_EMPTY;
 +      }
  
        if (flags & AT_SYMLINK_FOLLOW)
                how |= LOOKUP_FOLLOW;
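The user-visible effect of the capability check is easiest to see from userspace. A hypothetical test program (paths made up): linking a file through its descriptor with AT_EMPTY_PATH used to work unprivileged, and now fails with ENOENT unless the caller has CAP_DAC_READ_SEARCH:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/src", O_RDONLY);

	if (fd < 0)
		return 1;
	/* Previously equivalent to AT_SYMLINK_FOLLOW on /proc/self/fd/<fd>;
	 * without CAP_DAC_READ_SEARCH this now fails with ENOENT. */
	if (linkat(fd, "", AT_FDCWD, "/tmp/hardlink", AT_EMPTY_PATH))
		perror("linkat");
	close(fd);
	return 0;
}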
diff --combined include/linux/fs.h
index e7893523f81f200cb1f038820493e45dd8b7e697,c9013876eb2904466769f8c924c9953c67ac15be..3b4cd8296e4165625b11584b7f134a5587b0f23b
@@@ -46,6 -46,7 +46,7 @@@ struct vfsmount
  struct cred;
  struct swap_info_struct;
  struct seq_file;
+ struct workqueue_struct;
  
  extern void __init inode_init(void);
  extern void __init inode_init_early(void);
@@@ -63,8 -64,7 +64,7 @@@ struct buffer_head
  typedef int (get_block_t)(struct inode *inode, sector_t iblock,
                        struct buffer_head *bh_result, int create);
  typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
-                       ssize_t bytes, void *private, int ret,
-                       bool is_async);
+                       ssize_t bytes, void *private);
  
  #define MAY_EXEC              0x00000001
  #define MAY_WRITE             0x00000002
@@@ -1328,6 -1328,9 +1328,9 @@@ struct super_block 
  
        /* Being remounted read-only */
        int s_readonly_remount;
+
+       /* AIO completions deferred from interrupt context */
+       struct workqueue_struct *s_dio_done_wq;
  };
  
  /* superblock cache pruning functions */
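The new s_dio_done_wq field pairs with the two direct-io commits in this pull: ->end_io no longer receives ret/is_async, and completions that need process context (O_(D)SYNC AIO, unwritten-extent conversion) are bounced to this per-superblock workqueue instead of running from the interrupt handler. Roughly what fs/direct-io.c does with it -- a condensed sketch, not the verbatim patch:

static void dio_aio_complete_work(struct work_struct *work)
{
	struct dio *dio = container_of(work, struct dio, complete_work);

	/* process context: safe to convert extents, issue a flush, etc. */
	dio_complete(dio, dio->iocb->ki_pos, 0, true);
}

/* In the bio end_io path (interrupt context): */
static void dio_bio_end_aio_sketch(struct dio *dio)
{
	if (dio->result && dio->defer_completion) {
		INIT_WORK(&dio->complete_work, dio_aio_complete_work);
		queue_work(dio->inode->i_sb->s_dio_done_wq,
			   &dio->complete_work);
	} else {
		dio_complete(dio, dio->iocb->ki_pos, 0, true);
	}
}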
@@@ -1804,7 -1807,7 +1807,7 @@@ enum file_time_flags 
        S_VERSION = 8,
  };
  
- extern void touch_atime(struct path *);
+ extern void touch_atime(const struct path *);
  static inline void file_accessed(struct file *file)
  {
        if (!(file->f_flags & O_NOATIME))
@@@ -2503,7 -2506,6 +2506,7 @@@ extern void generic_fillattr(struct ino
  extern int vfs_getattr(struct path *, struct kstat *);
  void __inode_add_bytes(struct inode *inode, loff_t bytes);
  void inode_add_bytes(struct inode *inode, loff_t bytes);
 +void __inode_sub_bytes(struct inode *inode, loff_t bytes);
  void inode_sub_bytes(struct inode *inode, loff_t bytes);
  loff_t inode_get_bytes(struct inode *inode);
  void inode_set_bytes(struct inode *inode, loff_t bytes);
diff --combined kernel/module.c
index 9f5ddae72f4420623958f9ed9410630452d31715,c6756d1c6d73a5c5a3743174a907fbc802e2fbce..dc582749fa1386db25af23929db4fc20b6c7acce
@@@ -136,7 -136,6 +136,7 @@@ static int param_set_bool_enable_only(c
  }
  
  static const struct kernel_param_ops param_ops_bool_enable_only = {
 +      .flags = KERNEL_PARAM_FL_NOARG,
        .set = param_set_bool_enable_only,
        .get = param_get_bool,
  };
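KERNEL_PARAM_FL_NOARG marks the parameter as valid without an "=value", so a bare "module.param" acts like "param=1" for these enable-only bools. A sketch of how a set handler copes with the missing argument (modelled on the stock param_set_bool(); the handler name here is made up):

static int param_set_mybool(const char *val, const struct kernel_param *kp)
{
	/* with KERNEL_PARAM_FL_NOARG a bare parameter arrives as val == NULL */
	if (!val)
		val = "1";
	return strtobool(val, kp->arg);
}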
@@@ -604,7 -603,7 +604,7 @@@ static void setup_modinfo_##field(struc
  static ssize_t show_modinfo_##field(struct module_attribute *mattr,   \
                        struct module_kobject *mk, char *buffer)      \
  {                                                                     \
 -      return sprintf(buffer, "%s\n", mk->mod->field);               \
 +      return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field);  \
  }                                                                     \
  static int modinfo_##field##_exists(struct module *mod)               \
  {                                                                     \
@@@ -1612,14 -1611,6 +1612,14 @@@ static void module_remove_modinfo_attrs
        kfree(mod->modinfo_attrs);
  }
  
 +static void mod_kobject_put(struct module *mod)
 +{
 +      DECLARE_COMPLETION_ONSTACK(c);
 +      mod->mkobj.kobj_completion = &c;
 +      kobject_put(&mod->mkobj.kobj);
 +      wait_for_completion(&c);
 +}
 +
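mod_kobject_put() only returns once the kobject core is truly done with mkobj: kobject_put() may drop the last reference asynchronously, so waiting on the on-stack completion keeps the embedded kobject alive until ->release has run. The counterpart lives in kernel/params.c (per the companion patch in this series); it is essentially:

static void module_kobj_release(struct kobject *kobj)
{
	struct module_kobject *mk = to_module_kobject(kobj);

	complete(mk->kobj_completion);	/* lets mod_kobject_put() proceed */
}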
  static int mod_sysfs_init(struct module *mod)
  {
        int err;
        err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
                                   "%s", mod->name);
        if (err)
 -              kobject_put(&mod->mkobj.kobj);
 +              mod_kobject_put(mod);
  
        /* delay uevent until full sysfs population */
  out:
@@@ -1691,7 -1682,7 +1691,7 @@@ out_unreg_param
  out_unreg_holders:
        kobject_put(mod->holders_dir);
  out_unreg:
 -      kobject_put(&mod->mkobj.kobj);
 +      mod_kobject_put(mod);
  out:
        return err;
  }
@@@ -1700,7 -1691,7 +1700,7 @@@ static void mod_sysfs_fini(struct modul
  {
        remove_notes_attrs(mod);
        remove_sect_attrs(mod);
 -      kobject_put(&mod->mkobj.kobj);
 +      mod_kobject_put(mod);
  }
  
  #else /* !CONFIG_SYSFS */
@@@ -2549,21 -2540,20 +2549,20 @@@ static int copy_module_from_user(const 
  /* Sets info->hdr and info->len. */
  static int copy_module_from_fd(int fd, struct load_info *info)
  {
-       struct file *file;
+       struct fd f = fdget(fd);
        int err;
        struct kstat stat;
        loff_t pos;
        ssize_t bytes = 0;
  
-       file = fget(fd);
-       if (!file)
+       if (!f.file)
                return -ENOEXEC;
  
-       err = security_kernel_module_from_file(file);
+       err = security_kernel_module_from_file(f.file);
        if (err)
                goto out;
  
-       err = vfs_getattr(&file->f_path, &stat);
+       err = vfs_getattr(&f.file->f_path, &stat);
        if (err)
                goto out;
  
  
        pos = 0;
        while (pos < stat.size) {
-               bytes = kernel_read(file, pos, (char *)(info->hdr) + pos,
+               bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos,
                                    stat.size - pos);
                if (bytes < 0) {
                        vfree(info->hdr);
        info->len = pos;
  
  out:
-       fput(file);
+       fdput(f);
        return err;
  }
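The conversion above is the same fget() -> fdget() shape used by the fchmod(), epoll_ctl() and KVM patches in this pile: struct fd records whether a real reference was taken, so fdput() can skip the atomic drop when the file table was not shared. As a generic sketch (operate_on() is a placeholder, not a real kernel helper):

#include <linux/file.h>

static int do_something_with_fd(int fd)
{
	struct fd f = fdget(fd);
	int err;

	if (!f.file)
		return -EBADF;

	err = operate_on(f.file);	/* the actual work goes here */

	fdput(f);	/* drops the reference only if fdget() took one */
	return err;
}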