Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)
Pull third pile of signal handling patches from Al Viro:
 "This time it's mostly helpers and conversions to them; there's a lot
  of stuff remaining in the tree, but that'll either go in -rc2
  (isolated bug fixes, ideally via arch maintainers' trees) or will sit
  there until the next cycle."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal:
  x86: get rid of calling do_notify_resume() when returning to kernel mode
  blackfin: check __get_user() return value
  whack-a-mole with TIF_FREEZE
  FRV: Optimise the system call exit path in entry.S [ver #2]
  FRV: Shrink TIF_WORK_MASK [ver #2]
  FRV: Prevent syscall exit tracing and notify_resume at end of kernel exceptions
  new helper: signal_delivered()
  powerpc: get rid of restore_sigmask()
  most of set_current_blocked() callers want SIGKILL/SIGSTOP removed from set
  set_restore_sigmask() is never called without SIGPENDING (and never should be)
  TIF_RESTORE_SIGMASK can be set only when TIF_SIGPENDING is set
  don't call try_to_freeze() from do_signal()
  pull clearing RESTORE_SIGMASK into block_sigmask()
  sh64: failure to build sigframe != signal without handler
  openrisc: tracehook_signal_handler() is supposed to be called on success
  new helper: sigmask_to_save()
  new helper: restore_saved_sigmask()
  new helpers: {clear,test,test_and_clear}_restore_sigmask()
  HAVE_RESTORE_SIGMASK is defined on all architectures now

228 files changed:
Documentation/filesystems/Locking
Documentation/filesystems/vfs.txt
arch/alpha/include/asm/posix_types.h
arch/arm/include/asm/posix_types.h
arch/avr32/include/asm/posix_types.h
arch/blackfin/include/asm/posix_types.h
arch/cris/include/asm/posix_types.h
arch/frv/include/asm/posix_types.h
arch/h8300/include/asm/posix_types.h
arch/ia64/include/asm/posix_types.h
arch/ia64/kernel/perfmon.c
arch/ia64/kernel/sys_ia64.c
arch/m32r/include/asm/posix_types.h
arch/m68k/include/asm/posix_types.h
arch/mips/include/asm/posix_types.h
arch/mips/include/asm/stat.h
arch/mn10300/include/asm/posix_types.h
arch/parisc/include/asm/posix_types.h
arch/parisc/include/asm/stat.h
arch/powerpc/include/asm/posix_types.h
arch/powerpc/include/asm/stat.h
arch/s390/include/asm/posix_types.h
arch/sh/include/asm/posix_types_32.h
arch/sh/include/asm/posix_types_64.h
arch/sparc/include/asm/posix_types.h
arch/sparc/kernel/sys_sparc_64.c
arch/tile/include/asm/compat.h
arch/x86/include/asm/posix_types_32.h
drivers/base/soc.c
drivers/gpu/drm/i810/i810_dma.c
fs/9p/vfs_inode_dotl.c
fs/affs/affs.h
fs/aio.c
fs/attr.c
fs/binfmt_elf.c
fs/binfmt_flat.c
fs/btrfs/acl.c
fs/btrfs/backref.c
fs/btrfs/backref.h
fs/btrfs/btrfs_inode.h
fs/btrfs/check-integrity.c
fs/btrfs/ctree.c
fs/btrfs/ctree.h
fs/btrfs/delayed-inode.c
fs/btrfs/delayed-ref.c
fs/btrfs/delayed-ref.h
fs/btrfs/disk-io.c
fs/btrfs/disk-io.h
fs/btrfs/export.c
fs/btrfs/extent-tree.c
fs/btrfs/extent_io.c
fs/btrfs/extent_io.h
fs/btrfs/file.c
fs/btrfs/free-space-cache.c
fs/btrfs/inode.c
fs/btrfs/ioctl.c
fs/btrfs/ioctl.h
fs/btrfs/ordered-data.c
fs/btrfs/ordered-data.h
fs/btrfs/print-tree.c
fs/btrfs/reada.c
fs/btrfs/scrub.c
fs/btrfs/super.c
fs/btrfs/transaction.c
fs/btrfs/tree-log.c
fs/btrfs/ulist.c
fs/btrfs/ulist.h
fs/btrfs/volumes.c
fs/btrfs/volumes.h
fs/btrfs/xattr.c
fs/buffer.c
fs/ceph/export.c
fs/compat.c
fs/dcache.c
fs/ecryptfs/inode.c
fs/exec.c
fs/exportfs/expfs.c
fs/ext4/Kconfig
fs/ext4/balloc.c
fs/ext4/bitmap.c
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_extents.h
fs/ext4/ext4_jbd2.c
fs/ext4/ext4_jbd2.h
fs/ext4/extents.c
fs/ext4/file.c
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/ioctl.c
fs/ext4/mballoc.c
fs/ext4/mmp.c
fs/ext4/namei.c
fs/ext4/resize.c
fs/ext4/super.c
fs/ext4/xattr.c
fs/ext4/xattr.h
fs/fat/inode.c
fs/fcntl.c
fs/file_table.c
fs/fuse/file.c
fs/fuse/inode.c
fs/gfs2/export.c
fs/hpfs/alloc.c
fs/hpfs/anode.c
fs/hpfs/dir.c
fs/hpfs/dnode.c
fs/hpfs/ea.c
fs/hpfs/hpfs.h
fs/hpfs/hpfs_fn.h
fs/hpfs/inode.c
fs/hpfs/map.c
fs/hpfs/namei.c
fs/hpfs/super.c
fs/inode.c
fs/internal.h
fs/isofs/export.c
fs/jbd2/Kconfig
fs/jbd2/commit.c
fs/jbd2/journal.c
fs/jbd2/recovery.c
fs/jbd2/revoke.c
fs/jbd2/transaction.c
fs/jffs2/jffs2_fs_sb.h
fs/jffs2/os-linux.h
fs/jffs2/super.c
fs/jffs2/wbuf.c
fs/lockd/svc.c
fs/locks.c
fs/namei.c
fs/namespace.c
fs/ncpfs/file.c
fs/ncpfs/ncp_fs_sb.h
fs/nfs/callback.c
fs/nfs/dir.c
fs/nfs/file.c
fs/nfsd/auth.c
fs/nfsd/export.c
fs/nfsd/fault_inject.c
fs/nfsd/nfs4callback.c
fs/nfsd/nfs4idmap.c
fs/nfsd/nfs4recover.c
fs/nfsd/nfs4state.c
fs/nfsd/nfs4xdr.c
fs/nfsd/nfsctl.c
fs/nfsd/nfssvc.c
fs/nfsd/state.h
fs/nfsd/xdr4.h
fs/nilfs2/namei.c
fs/notify/fsnotify.c
fs/ntfs/file.c
fs/ocfs2/blockcheck.c
fs/ocfs2/dlm/dlmast.c
fs/ocfs2/dlm/dlmcommon.h
fs/ocfs2/dlm/dlmdomain.c
fs/ocfs2/export.c
fs/ocfs2/inode.c
fs/ocfs2/ioctl.c
fs/ocfs2/move_extents.c
fs/ocfs2/namei.c
fs/ocfs2/symlink.c
fs/ocfs2/symlink.h
fs/open.c
fs/pipe.c
fs/pnode.c
fs/proc_namespace.c
fs/readdir.c
fs/reiserfs/inode.c
fs/reiserfs/journal.c
fs/reiserfs/reiserfs.h
fs/reiserfs/resize.c
fs/reiserfs/super.c
fs/signalfd.c
fs/splice.c
fs/statfs.c
fs/sync.c
fs/ubifs/dir.c
fs/udf/namei.c
fs/utimes.c
fs/xattr.c
fs/xfs/kmem.c
fs/xfs/kmem.h
fs/xfs/xfs_export.c
fs/xfs/xfs_file.c
fs/xfs/xfs_log.c
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_trans.c
fs/xfs/xfs_trans.h
include/asm-generic/posix_types.h
include/linux/errno.h
include/linux/exportfs.h
include/linux/fs.h
include/linux/fsnotify_backend.h
include/linux/jbd2.h
include/linux/jbd_common.h
include/linux/lglock.h
include/linux/mm.h
include/linux/security.h
include/linux/sunrpc/svc.h
include/linux/sunrpc/svcauth.h
include/linux/sunrpc/svcauth_gss.h
include/linux/types.h
ipc/shm.c
kernel/Makefile
kernel/lglock.c [new file with mode: 0644]
mm/cleancache.c
mm/filemap.c
mm/filemap_xip.c
mm/internal.h
mm/mmap.c
mm/mremap.c
mm/nommu.c
mm/shmem.c
mm/util.c
net/sched/sch_atm.c
net/sunrpc/auth_gss/gss_krb5_wrap.c
net/sunrpc/auth_gss/svcauth_gss.c
net/sunrpc/rpcb_clnt.c
net/sunrpc/svc.c
net/sunrpc/svc_xprt.c
net/sunrpc/svcauth_unix.c
security/apparmor/lsm.c
security/capability.c
security/commoncap.c
security/security.c
security/selinux/hooks.c
security/selinux/selinuxfs.c
security/smack/smack_lsm.c

index d449e632e6a09fca5ca0fedd6bdc2cf87eb44a96..8e2da1e06e3b2371eb82ef07105e63ad97d224b6 100644 (file)
@@ -61,6 +61,7 @@ ata *);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*removexattr) (struct dentry *, const char *);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+       void (*update_time)(struct inode *, struct timespec *, int);
 
 locking rules:
        all may block
@@ -87,6 +88,8 @@ getxattr:     no
 listxattr:     no
 removexattr:   yes
 fiemap:                no
+update_time:   no
+
        Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
        cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
index ef19f91a0f12021f5c003f6084711e6b3b4897cc..efd23f4817044ac9d55932bd9476d309a02918dc 100644 (file)
@@ -363,6 +363,7 @@ struct inode_operations {
        ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*removexattr) (struct dentry *, const char *);
+       void (*update_time)(struct inode *, struct timespec *, int);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -471,6 +472,9 @@ otherwise noted.
   removexattr: called by the VFS to remove an extended attribute from
        a file. This method is called by removexattr(2) system call.
 
+  update_time: called by the VFS to update a specific time or the i_version of
+       an inode.  If this is not defined the VFS will update the inode itself
+       and call mark_inode_dirty_sync.
 
 The Address Space Object
 ========================
index 24779fc95994efb5c4d69e4d507f3cba581570a4..5a8a48320efe9f5c577f9cc8363a4de2a6725559 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned int   __kernel_ino_t;
 #define __kernel_ino_t __kernel_ino_t
 
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
index efdf99045d879e240b9bc41f1f0781efea6ac10f..d2de9cbbcd9bcaf6a9e5b76eefac1f8c8eb7b39d 100644 (file)
@@ -22,9 +22,6 @@
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short         __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 74667bfc88cc7676e4c43332b40705908c747021..9ba9e749b3f34d7c2760d1d9784ff9ede9528ef3 100644 (file)
@@ -17,9 +17,6 @@
 typedef unsigned short  __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 41bc1875c4d7fd367bbbea5432b9332a2821557f..1bd3436db6a7b7d080bdf4bcb1a09db621fde1b0 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned int __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 234891c74e2bbe21d0aed4301a7552200829c101..ce4e517931514fb0f6d52a2ef1af35b39aa19ea0 100644 (file)
@@ -15,9 +15,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short  __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 3f34cb45fbb3fafd24edff901ca8b52d9941532d..fe512af74a5afbb57dbc1490214155cdbbeb220d 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index bc4c34efb1ad167ccafa90ee8796d605058572a0..91e62ba4c7b02e99cf73893ef04e6a8cf259025a 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 7323ab9467ebae726473512588d2c8f41d0ceb40..99ee1d6510cfc98a7dc66128fca841af022b133f 100644 (file)
@@ -1,9 +1,6 @@
 #ifndef _ASM_IA64_POSIX_TYPES_H
 #define _ASM_IA64_POSIX_TYPES_H
 
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
 
 #include <asm-generic/posix_types.h>
index f00ba025375d5696d0070bfe640b6f26f554eebd..d7f558c1e7117bfff75a056d4fee9213c6a4b7fb 100644 (file)
@@ -604,12 +604,6 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
        spin_unlock(&(x)->ctx_lock);
 }
 
-static inline unsigned long 
-pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
-{
-       return get_unmapped_area(file, addr, len, pgoff, flags);
-}
-
 /* forward declaration */
 static const struct dentry_operations pfmfs_dentry_operations;
 
@@ -2333,8 +2327,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
        down_write(&task->mm->mmap_sem);
 
        /* find some free area in address space, must have mmap sem held */
-       vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
-       if (vma->vm_start == 0UL) {
+       vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
+       if (IS_ERR_VALUE(vma->vm_start)) {
                DPRINT(("Cannot find unmapped area for size %ld\n", size));
                up_write(&task->mm->mmap_sem);
                goto error;
index 609d50056a6c7bd9fba2d46b757960b3e8893388..d9439ef2f66187d9e864f91778b8a9022c41577c 100644 (file)
@@ -171,22 +171,9 @@ asmlinkage unsigned long
 ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
             unsigned long new_addr)
 {
-       extern unsigned long do_mremap (unsigned long addr,
-                                       unsigned long old_len,
-                                       unsigned long new_len,
-                                       unsigned long flags,
-                                       unsigned long new_addr);
-
-       down_write(&current->mm->mmap_sem);
-       {
-               addr = do_mremap(addr, old_len, new_len, flags, new_addr);
-       }
-       up_write(&current->mm->mmap_sem);
-
-       if (IS_ERR((void *) addr))
-               return addr;
-
-       force_successful_syscall_return();
+       addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
+       if (!IS_ERR((void *) addr))
+               force_successful_syscall_return();
        return addr;
 }
 
index 0195850e1f88698b7a6c29ffd8b807b9e38a5b53..236de26a409b3f9a3d85df67129d88a488d025d1 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 6373093be72bb049f37f071468c3b6c73d6daafe..cf4dbf70fdc73f116f95a83c698511fd7b5f4a62 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index e0308dcca1358f6f2db6161486299de11d61dde5..fa03ec3fbf897a4c3271d8a38025f383c82760cc 100644 (file)
  * assume GCC is being used.
  */
 
-#if (_MIPS_SZLONG == 64)
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-#endif
-
 typedef long           __kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
index 6e00f751ab6dc675b886d736e1f0c814136cfbd4..fe9a4c3ec5a1f2d9f557adf7c32348b0c892e374 100644 (file)
@@ -20,7 +20,7 @@ struct stat {
        long            st_pad1[3];             /* Reserved for network id */
        ino_t           st_ino;
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
        uid_t           st_uid;
        gid_t           st_gid;
        unsigned        st_rdev;
@@ -55,7 +55,7 @@ struct stat64 {
        unsigned long long      st_ino;
 
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
 
        uid_t           st_uid;
        gid_t           st_gid;
@@ -96,7 +96,7 @@ struct stat {
        unsigned long           st_ino;
 
        mode_t                  st_mode;
-       nlink_t                 st_nlink;
+       __u32                   st_nlink;
 
        uid_t                   st_uid;
        gid_t                   st_gid;
index ab506181ec3108ad98c2db77d73f01e2e2b9b134..d31eeea480cfdda8a4231351abbf61c76da08a5f 100644 (file)
@@ -20,9 +20,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 5212b0357daf15aaf454b751b0eb146fc47b34ac..b9344256f76b365db1c6a9a99b08b337f706970c 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short         __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index 9d5fbbc5c31f14df4791b1005e83ade0cd0505f4..d76fbda5d62c0437f5fb52c389e144597f054b12 100644 (file)
@@ -7,7 +7,7 @@ struct stat {
        unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
        ino_t           st_ino;         /* 32 bits */
        mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
        unsigned short  st_reserved1;   /* old st_uid */
        unsigned short  st_reserved2;   /* old st_gid */
        unsigned int    st_rdev;
@@ -42,7 +42,7 @@ struct hpux_stat64 {
        unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
        ino_t           st_ino;         /* 32 bits */
        mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
        unsigned short  st_reserved1;   /* old st_uid */
        unsigned short  st_reserved2;   /* old st_gid */
        unsigned int    st_rdev;
index f1393252bbdad837c97b794c8534328a9c912ee3..2958c5b97b2dd4100ac5907129b4736d94458cf7 100644 (file)
@@ -16,9 +16,6 @@ typedef int           __kernel_ssize_t;
 typedef long           __kernel_ptrdiff_t;
 #define __kernel_size_t __kernel_size_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef short          __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #endif
index e4edc510b530cfed6420e96012a69dc424a34e79..10cfb558e0fd7d1a82df840dd67c3578be1cbe52 100644 (file)
@@ -30,11 +30,11 @@ struct stat {
        unsigned long   st_dev;
        ino_t           st_ino;
 #ifdef __powerpc64__
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
        mode_t          st_mode;
 #else
        mode_t          st_mode;
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
 #endif
        uid_t           st_uid;
        gid_t           st_gid;
index edf8527ff08d9bdf9e5f0468e24faebfdd372c34..7be104c0f19230e157d569efb9a18002dfa4709d 100644 (file)
@@ -24,7 +24,6 @@ typedef unsigned short        __kernel_old_dev_t;
 
 typedef unsigned long   __kernel_ino_t;
 typedef unsigned short  __kernel_mode_t;
-typedef unsigned short  __kernel_nlink_t;
 typedef unsigned short  __kernel_ipc_pid_t;
 typedef unsigned short  __kernel_uid_t;
 typedef unsigned short  __kernel_gid_t;
@@ -35,7 +34,6 @@ typedef int             __kernel_ptrdiff_t;
 
 typedef unsigned int    __kernel_ino_t;
 typedef unsigned int    __kernel_mode_t;
-typedef unsigned int    __kernel_nlink_t;
 typedef int             __kernel_ipc_pid_t;
 typedef unsigned int    __kernel_uid_t;
 typedef unsigned int    __kernel_gid_t;
@@ -47,7 +45,6 @@ typedef unsigned long   __kernel_sigset_t;      /* at least 32 bits */
 
 #define __kernel_ino_t  __kernel_ino_t
 #define __kernel_mode_t __kernel_mode_t
-#define __kernel_nlink_t __kernel_nlink_t
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 #define __kernel_uid_t __kernel_uid_t
 #define __kernel_gid_t __kernel_gid_t
index abda58467ece9e86ff1249029bcc7143281b2f04..ba0bdc423b072fa62f74fbc64e1bc5b683f2af7d 100644 (file)
@@ -3,8 +3,6 @@
 
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short __kernel_uid_t;
index fcda07b4a616be8196f105ce5d2faee8682c9af1..244f7e950e176b0cbdc907f70b4fdf88572b08f0 100644 (file)
@@ -3,8 +3,6 @@
 
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 typedef unsigned short __kernel_uid_t;
index 3070f25ae90a3e235eaaf2373949226ea83acfb6..156220ed99eb7dfbfe8696ea9f04da84fc00ff5f 100644 (file)
@@ -9,8 +9,6 @@
 
 #if defined(__sparc__) && defined(__arch64__)
 /* sparc 64 bit */
-typedef unsigned int           __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
 
 typedef unsigned short                __kernel_old_uid_t;
 typedef unsigned short         __kernel_old_gid_t;
@@ -38,9 +36,6 @@ typedef unsigned short         __kernel_gid_t;
 typedef unsigned short         __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef short                  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef long                   __kernel_daddr_t;
 #define __kernel_daddr_t __kernel_daddr_t
 
index 3ee51f189a55297b0babeb1f54d0b40af97de6f8..275f74fd6f6a3f16fdd4e5fae291af2a364075c0 100644 (file)
@@ -580,16 +580,9 @@ SYSCALL_DEFINE5(64_mremap, unsigned long, addr,    unsigned long, old_len,
                unsigned long, new_len, unsigned long, flags,
                unsigned long, new_addr)
 {
-       unsigned long ret = -EINVAL;
-
        if (test_thread_flag(TIF_32BIT))
-               goto out;
-
-       down_write(&current->mm->mmap_sem);
-       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
-       up_write(&current->mm->mmap_sem);
-out:
-       return ret;       
+               return -EINVAL;
+       return sys_mremap(addr, old_len, new_len, flags, new_addr);
 }
 
 /* we come to here via sys_nis_syscall so it can setup the regs argument */
index 69adc08d36a52541b1be754dd7c809824b2d707a..6e74450ff0a110afc32901e273d74a77c80f8ca4 100644 (file)
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
 typedef __kernel_mode_t compat_mode_t;
 typedef __kernel_dev_t compat_dev_t;
 typedef __kernel_loff_t compat_loff_t;
-typedef __kernel_nlink_t compat_nlink_t;
 typedef __kernel_ipc_pid_t compat_ipc_pid_t;
 typedef __kernel_daddr_t compat_daddr_t;
 typedef __kernel_fsid_t        compat_fsid_t;
index 99f262e04b91b6d553fd65bd61957bd9cb5cbd15..8e525059e7d81c0a4cd46dfa2f62695daba80fee 100644 (file)
@@ -10,9 +10,6 @@
 typedef unsigned short __kernel_mode_t;
 #define __kernel_mode_t __kernel_mode_t
 
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
 typedef unsigned short __kernel_ipc_pid_t;
 #define __kernel_ipc_pid_t __kernel_ipc_pid_t
 
index ba29b2e73d48936ab9a93a028abd0b0d9cd29691..72b5e7280d14792e6d83f3a59e4d4793d20fe28c 100644 (file)
@@ -42,7 +42,7 @@ struct device *soc_device_to_device(struct soc_device *soc_dev)
        return &soc_dev->dev;
 }
 
-static mode_t soc_attribute_mode(struct kobject *kobj,
+static umode_t soc_attribute_mode(struct kobject *kobj,
                                  struct attribute *attr,
                                  int index)
 {
index f920fb5e42b63846e3d8b7b782b492e547e18eef..fa9439159ebd6bc85cdf4e27a307d9cde12dcbd6 100644 (file)
@@ -130,11 +130,10 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                return -EINVAL;
 
        /* This is all entirely broken */
-       down_write(&current->mm->mmap_sem);
        old_fops = file_priv->filp->f_op;
        file_priv->filp->f_op = &i810_buffer_fops;
        dev_priv->mmap_buffer = buf;
-       buf_priv->virtual = (void *)do_mmap(file_priv->filp, 0, buf->total,
+       buf_priv->virtual = (void *)vm_mmap(file_priv->filp, 0, buf->total,
                                            PROT_READ | PROT_WRITE,
                                            MAP_SHARED, buf->bus_address);
        dev_priv->mmap_buffer = NULL;
@@ -145,7 +144,6 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                retcode = PTR_ERR(buf_priv->virtual);
                buf_priv->virtual = NULL;
        }
-       up_write(&current->mm->mmap_sem);
 
        return retcode;
 }
index a1e6c990cd410efded55c826f03bc5db13839d75..e3dd2a1e2bfc18e47abae82bce7ee60238527c08 100644 (file)
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
        return current_fsgid();
 }
 
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-       struct dentry *dentry;
-
-       spin_lock(&inode->i_lock);
-       /* Directory should have only one entry. */
-       BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-       dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-       spin_unlock(&inode->i_lock);
-       return dentry;
-}
-
 static int v9fs_test_inode_dotl(struct inode *inode, void *data)
 {
        struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
        if (dir->i_mode & S_ISGID)
                omode |= S_ISGID;
 
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
                err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                 dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
 
        v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid))
                return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
                return -EINVAL;
 
        v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
        dfid = v9fs_fid_lookup(dir_dentry);
        if (IS_ERR(dfid)) {
                err = PTR_ERR(dfid);
index 45a0ce45d7b46afa94b1290511bc1f91a9872137..1fceb320d2f22c16bc1a900cb27597d68977dbbd 100644 (file)
 #define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
 #define AFFS_BLOCK(sb, bh, blk)                (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
 
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS      0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS      0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
 #define AFFS_HEAD(bh)          ((struct affs_head *)(bh)->b_data)
 #define AFFS_TAIL(sb, bh)      ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
 #define AFFS_ROOT_HEAD(bh)     ((struct affs_root_head *)(bh)->b_data)
index 8c7c8b805372094cc5072b7cd49b5e771ab507c4..55c4c76560537f7fe72d6ff5f429eff666b86789 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
        info->mmap_size = nr_pages * PAGE_SIZE;
        dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
        down_write(&ctx->mm->mmap_sem);
-       info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 
-                                 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
-                                 0);
+       info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
+                                       PROT_READ|PROT_WRITE,
+                                       MAP_ANONYMOUS|MAP_PRIVATE, 0);
        if (IS_ERR((void *)info->mmap_base)) {
                up_write(&ctx->mm->mmap_sem);
                info->mmap_size = 0;
index 584620e5dee52b5be4a456fb0572a5227a0ef534..0da90951d2776f827a905337938399ada79e8e69 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
                        return -EPERM;
        }
 
+       if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
+               if (attr->ia_size != inode->i_size)
+                       inode_inc_iversion(inode);
+       }
+
        if ((ia_valid & ATTR_MODE)) {
                umode_t amode = attr->ia_mode;
                /* Flag setting protected by i_mutex */
index e658dd134b95fb375b371a931e739baa95d249a8..1b52956afe33ab07889c3963ce2c41b32133483b 100644 (file)
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        if (!size)
                return addr;
 
-       down_write(&current->mm->mmap_sem);
        /*
        * total_size is the size of the ELF (interpreter) image.
        * The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
        */
        if (total_size) {
                total_size = ELF_PAGEALIGN(total_size);
-               map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
                if (!BAD_ADDR(map_addr))
-                       do_munmap(current->mm, map_addr+size, total_size-size);
+                       vm_munmap(map_addr+size, total_size-size);
        } else
-               map_addr = do_mmap(filep, addr, size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, size, prot, type, off);
 
-       up_write(&current->mm->mmap_sem);
        return(map_addr);
 }
 
index 6b2daf99fab8bcd91d314f0abd951b8472a092d2..178cb70acc26de80ec3db21a8455e88b7fc0360b 100644 (file)
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                                realdatastart = (unsigned long) -ENOMEM;
                        printk("Unable to allocate RAM for process data, errno %d\n",
                                        (int)-realdatastart);
-                       do_munmap(current->mm, textpos, text_len);
+                       vm_munmap(textpos, text_len);
                        ret = realdatastart;
                        goto err;
                }
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                if (IS_ERR_VALUE(result)) {
                        printk("Unable to read data+bss, errno %d\n", (int)-result);
-                       do_munmap(current->mm, textpos, text_len);
-                       do_munmap(current->mm, realdatastart, len);
+                       vm_munmap(textpos, text_len);
+                       vm_munmap(realdatastart, len);
                        ret = result;
                        goto err;
                }
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                }
                if (IS_ERR_VALUE(result)) {
                        printk("Unable to read code+data+bss, errno %d\n",(int)-result);
-                       do_munmap(current->mm, textpos, text_len + data_len + extra +
+                       vm_munmap(textpos, text_len + data_len + extra +
                                MAX_SHARED_LIBS * sizeof(unsigned long));
                        ret = result;
                        goto err;
index 89b156d85d63c9f29b66413e1558e85a758d0e12..761e2cd8fed16e6046951e50504b8bb9e7acd3e4 100644 (file)
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                if (ret > 0) {
                        /* we need an acl */
                        ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+               } else {
+                       cache_no_acl(inode);
                }
+       } else {
+               cache_no_acl(inode);
        }
 failed:
        posix_acl_release(acl);
index bcec06750232e6cc3de09c62648201547709222b..3f75895c919bcc3b80ae63ab1fca42dcf335f95b 100644 (file)
 #include "delayed-ref.h"
 #include "locking.h"
 
+struct extent_inode_elem {     /* one node of a singly linked (inum, offset) list */
+       u64 inum;               /* objectid of the file extent item's key */
+       u64 offset;             /* key offset plus the position inside the file extent */
+       struct extent_inode_elem *next; /* following node, NULL at the tail */
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+                               struct btrfs_file_extent_item *fi,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie)
+{      /*
+        * if extent_item_pos lies inside the range described by the file
+        * extent item @fi, allocate a new element and put it in front of
+        * the list at *eie.
+        *
+        * returns 0 when an element was added, 1 when extent_item_pos is
+        * outside the range (list untouched) and -ENOMEM when the
+        * allocation failed.
+        */
+       u64 data_offset;
+       u64 data_len;
+       struct extent_inode_elem *e;
+
+       data_offset = btrfs_file_extent_offset(eb, fi);
+       data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+       if (extent_item_pos < data_offset ||
+           extent_item_pos >= data_offset + data_len)
+               return 1;       /* not within [data_offset, data_offset + data_len) */
+
+       e = kmalloc(sizeof(*e), GFP_NOFS);
+       if (!e)
+               return -ENOMEM;
+
+       e->next = *eie;         /* new element becomes the head of the list */
+       e->inum = key->objectid;
+       e->offset = key->offset + (extent_item_pos - data_offset);
+       *eie = e;
+
+       return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie)
+{
+       u64 disk_byte;
+       struct btrfs_key key;
+       struct btrfs_file_extent_item *fi;
+       int slot;
+       int nritems;
+       int extent_type;
+       int ret;
+
+       /*
+        * from the shared data ref, we only have the leaf but we need
+        * the key. thus, we must look into all items and see that we
+        * find one (some) with a reference to our extent item.
+        *
+        * every EXTENT_DATA item that is not inline and whose disk bytenr
+        * equals wanted_disk_byte is handed to check_extent_in_eb(), which
+        * may put a new element in front of the list at *eie.
+        *
+        * returns 0 after all items of @eb were looked at, or the negative
+        * error reported by check_extent_in_eb().
+        */
+       nritems = btrfs_header_nritems(eb);
+       for (slot = 0; slot < nritems; ++slot) {
+               btrfs_item_key_to_cpu(eb, &key, slot);
+               if (key.type != BTRFS_EXTENT_DATA_KEY)
+                       continue;
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue;
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+               if (disk_byte != wanted_disk_byte)
+                       continue;
+
+               ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+               if (ret < 0)    /* a positive return only means "no match here" */
+                       return ret;
+       }
+
+       return 0;
+}
+
 /*
  * this structure records all encountered refs on the way up to the root
  */
 struct __prelim_ref {
        struct list_head list;
        u64 root_id;
-       struct btrfs_key key;
+       struct btrfs_key key_for_search;        /* key passed to the tree search when resolving this ref */
        int level;
        int count;
+       struct extent_inode_elem *inode_list;   /* NULL until a list of extent_inode_elem is attached */
        u64 parent;
        u64 wanted_disk_byte;
 };
 
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we have the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we have the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
-                           struct btrfs_key *key, int level, u64 parent,
-                           u64 wanted_disk_byte, int count)
+                           struct btrfs_key *key, int level,
+                           u64 parent, u64 wanted_disk_byte, int count)
 {
        struct __prelim_ref *ref;
 
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 
        ref->root_id = root_id;
        if (key)
-               ref->key = *key;
+               ref->key_for_search = *key;
        else
-               memset(&ref->key, 0, sizeof(ref->key));
+               memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
 
+       ref->inode_list = NULL;
        ref->level = level;
        ref->count = count;
        ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
 }
 
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-                               struct ulist *parents,
-                               struct extent_buffer *eb, int level,
-                               u64 wanted_objectid, u64 wanted_disk_byte)
+                               struct ulist *parents, int level,
+                               struct btrfs_key *key, u64 wanted_disk_byte,
+                               const u64 *extent_item_pos)
 {
        int ret;
-       int slot;
+       int slot = path->slots[level];
+       struct extent_buffer *eb = path->nodes[level];
        struct btrfs_file_extent_item *fi;
-       struct btrfs_key key;
+       struct extent_inode_elem *eie = NULL;
        u64 disk_byte;
+       u64 wanted_objectid = key->objectid;
 
 add_parent:
-       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (level == 0 && extent_item_pos) {
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+               if (ret < 0)
+                       return ret;
+       }
+       ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
        if (ret < 0)
                return ret;
 
@@ -89,6 +211,7 @@ add_parent:
         * repeat this until we don't find any additional EXTENT_DATA items.
         */
        while (1) {
+               eie = NULL;
                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        return ret;
@@ -97,9 +220,9 @@ add_parent:
 
                eb = path->nodes[0];
                for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
-                       btrfs_item_key_to_cpu(eb, &key, slot);
-                       if (key.objectid != wanted_objectid ||
-                           key.type != BTRFS_EXTENT_DATA_KEY)
+                       btrfs_item_key_to_cpu(eb, key, slot);
+                       if (key->objectid != wanted_objectid ||
+                           key->type != BTRFS_EXTENT_DATA_KEY)
                                return 0;
                        fi = btrfs_item_ptr(eb, slot,
                                                struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                                        int search_commit_root,
+                                       u64 time_seq,
                                        struct __prelim_ref *ref,
-                                       struct ulist *parents)
+                                       struct ulist *parents,
+                                       const u64 *extent_item_pos)
 {
        struct btrfs_path *path;
        struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                goto out;
 
        path->lowest_level = level;
-       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
        pr_debug("search slot in root %llu (level %d, ref count %d) returned "
                 "%d for key (%llu %u %llu)\n",
                 (unsigned long long)ref->root_id, level, ref->count, ret,
-                (unsigned long long)ref->key.objectid, ref->key.type,
-                (unsigned long long)ref->key.offset);
+                (unsigned long long)ref->key_for_search.objectid,
+                ref->key_for_search.type,
+                (unsigned long long)ref->key_for_search.offset);
        if (ret < 0)
                goto out;
 
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
        }
 
-       /* the last two parameters will only be used for level == 0 */
-       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
-                               ref->wanted_disk_byte);
+       ret = add_all_parents(root, path, parents, level, &key,
+                               ref->wanted_disk_byte, extent_item_pos);
 out:
        btrfs_free_path(path);
        return ret;
@@ -191,8 +316,9 @@ out:
  * resolve all indirect backrefs from the list
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-                                  int search_commit_root,
-                                  struct list_head *head)
+                                  int search_commit_root, u64 time_seq,
+                                  struct list_head *head,
+                                  const u64 *extent_item_pos)
 {
        int err;
        int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
        struct __prelim_ref *new_ref;
        struct ulist *parents;
        struct ulist_node *node;
+       struct ulist_iterator uiter;
 
        parents = ulist_alloc(GFP_NOFS);
        if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                if (ref->count == 0)
                        continue;
                err = __resolve_indirect_ref(fs_info, search_commit_root,
-                                            ref, parents);
+                                            time_seq, ref, parents,
+                                            extent_item_pos);
                if (err) {
                        if (ret == 0)
                                ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                }
 
                /* we put the first parent into the ref at hand */
-               node = ulist_next(parents, NULL);
+               ULIST_ITER_INIT(&uiter);
+               node = ulist_next(parents, &uiter);
                ref->parent = node ? node->val : 0;
+               ref->inode_list =
+                       node ? (struct extent_inode_elem *)node->aux : 0;
 
                /* additional parents require new refs being added here */
-               while ((node = ulist_next(parents, node))) {
+               while ((node = ulist_next(parents, &uiter))) {
                        new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
                        if (!new_ref) {
                                ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                        }
                        memcpy(new_ref, ref, sizeof(*ref));
                        new_ref->parent = node->val;
+                       new_ref->inode_list =
+                                       (struct extent_inode_elem *)node->aux;
                        list_add(&new_ref->list, &ref->list);
                }
                ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
        return ret;
 }
 
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+                                    struct __prelim_ref *ref2)
+{      /*
+        * returns 1 if both refs agree in level, root_id, all three fields
+        * of key_for_search and parent; returns 0 as soon as one differs.
+        */
+       if (ref1->level != ref2->level)
+               return 0;
+       if (ref1->root_id != ref2->root_id)
+               return 0;
+       if (ref1->key_for_search.type != ref2->key_for_search.type)
+               return 0;
+       if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+               return 0;
+       if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+               return 0;
+       if (ref1->parent != ref2->parent)
+               return 0;
+
+       return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ *
+ * walks @head and, for every ref that has neither a parent nor a key type
+ * set, reads the block at ref->wanted_disk_byte and copies the key found in
+ * slot 0 (item key for a level 0 block, node key otherwise) into
+ * ref->key_for_search.
+ *
+ * returns 0 on success or -EIO when a tree block could not be obtained.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+                             struct list_head *head)
+{
+       struct list_head *pos;
+       struct extent_buffer *eb;
+
+       list_for_each(pos, head) {
+               struct __prelim_ref *ref;
+               ref = list_entry(pos, struct __prelim_ref, list);
+
+               /* refs that already know their parent need no key */
+               if (ref->parent)
+                       continue;
+               /* a non-zero type means a key was supplied when the ref was added */
+               if (ref->key_for_search.type)
+                       continue;
+               BUG_ON(!ref->wanted_disk_byte);
+               eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+                                    fs_info->tree_root->leafsize, 0);
+               /*
+                * fail the lookup instead of bringing the machine down; the
+                * caller already bails out on a non-zero return value.
+                */
+               if (!eb)
+                       return -EIO;
+               btrfs_tree_read_lock(eb);
+               if (btrfs_header_level(eb) == 0)
+                       btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+               else
+                       btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+               btrfs_tree_read_unlock(eb);
+               free_extent_buffer(eb);
+       }
+       return 0;
+}
+
 /*
  * merge two lists of backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
+ *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
  * mode = 2: merge identical parents
  */
 static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
 
                ref1 = list_entry(pos1, struct __prelim_ref, list);
 
-               if (mode == 1 && ref1->key.type == 0)
-                       continue;
                for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
                     pos2 = n2, n2 = pos2->next) {
                        struct __prelim_ref *ref2;
+                       struct __prelim_ref *xchg;
 
                        ref2 = list_entry(pos2, struct __prelim_ref, list);
 
                        if (mode == 1) {
-                               if (memcmp(&ref1->key, &ref2->key,
-                                          sizeof(ref1->key)) ||
-                                   ref1->level != ref2->level ||
-                                   ref1->root_id != ref2->root_id)
+                               if (!ref_for_same_block(ref1, ref2))
                                        continue;
+                               if (!ref1->parent && ref2->parent) {
+                                       xchg = ref1;
+                                       ref1 = ref2;
+                                       ref2 = xchg;
+                               }
                                ref1->count += ref2->count;
                        } else {
                                if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
  * smaller or equal that seq to the list
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-                             struct btrfs_key *info_key,
                              struct list_head *prefs)
 {
        struct btrfs_delayed_extent_op *extent_op = head->extent_op;
        struct rb_node *n = &head->node.rb_node;
+       struct btrfs_key key;
+       struct btrfs_key op_key = {0};
        int sgn;
        int ret = 0;
 
        if (extent_op && extent_op->update_key)
-               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+               btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 
        while ((n = rb_prev(n))) {
                struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_tree_ref *ref;
 
                        ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, &op_key,
                                               ref->level + 1, 0, node->bytenr,
                                               node->ref_mod * sgn);
                        break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                        struct btrfs_delayed_tree_ref *ref;
 
                        ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, NULL,
                                               ref->level + 1, ref->parent,
                                               node->bytenr,
                                               node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                }
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
-
                        ref = btrfs_delayed_node_to_data_ref(node);
 
                        key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                }
                case BTRFS_SHARED_DATA_REF_KEY: {
                        struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
 
                        ref = btrfs_delayed_node_to_data_ref(node);
 
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                             struct btrfs_path *path, u64 bytenr,
-                            struct btrfs_key *info_key, int *info_level,
-                            struct list_head *prefs)
+                            int *info_level, struct list_head *prefs)
 {
        int ret = 0;
        int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
         * enumerate all inline refs
         */
        leaf = path->nodes[0];
-       slot = path->slots[0] - 1;
+       slot = path->slots[0];
 
        item_size = btrfs_item_size_nr(leaf, slot);
        BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                struct btrfs_tree_block_info *info;
-               struct btrfs_disk_key disk_key;
 
                info = (struct btrfs_tree_block_info *)ptr;
                *info_level = btrfs_tree_block_level(leaf, info);
-               btrfs_tree_block_key(leaf, info, &disk_key);
-               btrfs_disk_key_to_cpu(info_key, &disk_key);
                ptr += sizeof(struct btrfs_tree_block_info);
                BUG_ON(ptr > end);
        } else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 
                switch (type) {
                case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                *info_level + 1, offset,
                                                bytenr, 1);
                        break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                        break;
                }
                case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, offset, info_key,
-                                              *info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, offset, NULL,
+                                              *info_level + 1, 0,
+                                              bytenr, 1);
                        break;
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                        key.type = BTRFS_EXTENT_DATA_KEY;
                        key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                        root = btrfs_extent_data_ref_root(leaf, dref);
-                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
-                                               count);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                              bytenr, count);
                        break;
                }
                default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                            struct btrfs_path *path, u64 bytenr,
-                           struct btrfs_key *info_key, int info_level,
-                           struct list_head *prefs)
+                           int info_level, struct list_head *prefs)
 {
        struct btrfs_root *extent_root = fs_info->extent_root;
        int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 
                switch (key.type) {
                case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                info_level + 1, key.offset,
                                                bytenr, 1);
                        break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                        break;
                }
                case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, key.offset, info_key,
-                                               info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, key.offset, NULL,
+                                              info_level + 1, 0,
+                                              bytenr, 1);
                        break;
                case BTRFS_EXTENT_DATA_REF_KEY: {
                        struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                        key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                        root = btrfs_extent_data_ref_root(leaf, dref);
                        ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-                                               bytenr, count);
+                                              bytenr, count);
                        break;
                }
                default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 seq, struct ulist *refs, struct ulist *roots)
+                            u64 delayed_ref_seq, u64 time_seq,
+                            struct ulist *refs, struct ulist *roots,
+                            const u64 *extent_item_pos)
 {
        struct btrfs_key key;
        struct btrfs_path *path;
-       struct btrfs_key info_key = { 0 };
        struct btrfs_delayed_ref_root *delayed_refs = NULL;
        struct btrfs_delayed_ref_head *head;
        int info_level = 0;
@@ -645,7 +830,7 @@ again:
                                btrfs_put_delayed_ref(&head->node);
                                goto again;
                        }
-                       ret = __add_delayed_refs(head, seq, &info_key,
+                       ret = __add_delayed_refs(head, delayed_ref_seq,
                                                 &prefs_delayed);
                        if (ret) {
                                spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
                struct extent_buffer *leaf;
                int slot;
 
+               path->slots[0]--;
                leaf = path->nodes[0];
-               slot = path->slots[0] - 1;
+               slot = path->slots[0];
                btrfs_item_key_to_cpu(leaf, &key, slot);
                if (key.objectid == bytenr &&
                    key.type == BTRFS_EXTENT_ITEM_KEY) {
                        ret = __add_inline_refs(fs_info, path, bytenr,
-                                               &info_key, &info_level, &prefs);
+                                               &info_level, &prefs);
                        if (ret)
                                goto out;
-                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                       ret = __add_keyed_refs(fs_info, path, bytenr,
                                               info_level, &prefs);
                        if (ret)
                                goto out;
@@ -676,21 +862,18 @@ again:
        }
        btrfs_release_path(path);
 
-       /*
-        * when adding the delayed refs above, the info_key might not have
-        * been known yet. Go over the list and replace the missing keys
-        */
-       list_for_each_entry(ref, &prefs_delayed, list) {
-               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
-                       memcpy(&ref->key, &info_key, sizeof(ref->key));
-       }
        list_splice_init(&prefs_delayed, &prefs);
 
+       ret = __add_missing_keys(fs_info, &prefs);
+       if (ret)
+               goto out;
+
        ret = __merge_refs(&prefs, 1);
        if (ret)
                goto out;
 
-       ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+       ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+                                     &prefs, extent_item_pos);
        if (ret)
                goto out;
 
@@ -709,7 +892,33 @@ again:
                        BUG_ON(ret < 0);
                }
                if (ref->count && ref->parent) {
-                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       struct extent_inode_elem *eie = NULL;
+                       if (extent_item_pos && !ref->inode_list) {
+                               u32 bsz;
+                               struct extent_buffer *eb;
+                               bsz = btrfs_level_size(fs_info->extent_root,
+                                                       info_level);
+                               eb = read_tree_block(fs_info->extent_root,
+                                                          ref->parent, bsz, 0);
+                               BUG_ON(!eb);
+                               ret = find_extent_in_eb(eb, bytenr,
+                                                       *extent_item_pos, &eie);
+                               ref->inode_list = eie;
+                               free_extent_buffer(eb);
+                       }
+                       ret = ulist_add_merge(refs, ref->parent,
+                                             (unsigned long)ref->inode_list,
+                                             (unsigned long *)&eie, GFP_NOFS);
+                       if (!ret && extent_item_pos) {
+                               /*
+                                * we've recorded that parent, so we must extend
+                                * its inode list here
+                                */
+                               BUG_ON(!eie);
+                               while (eie->next)
+                                       eie = eie->next;
+                               eie->next = ref->inode_list;
+                       }
                        BUG_ON(ret < 0);
                }
                kfree(ref);
@@ -734,6 +943,28 @@ out:
        return ret;
 }
 
+static void free_leaf_list(struct ulist *blocks)
+{      /*
+        * frees the extent_inode_elem chain stored in the aux value of every
+        * node of @blocks and then frees the ulist itself.
+        */
+       struct ulist_node *node = NULL;
+       struct extent_inode_elem *eie;
+       struct extent_inode_elem *eie_next;
+       struct ulist_iterator uiter;
+
+       ULIST_ITER_INIT(&uiter);
+       while ((node = ulist_next(blocks, &uiter))) {
+               if (!node->aux) /* nothing attached to this node */
+                       continue;
+               eie = (struct extent_inode_elem *)node->aux;
+               for (; eie; eie = eie_next) {
+                       eie_next = eie->next;   /* fetch the link before the element is freed */
+                       kfree(eie);
+               }
+               node->aux = 0;  /* the chain is gone, drop the stale pointer value */
+       }
+
+       ulist_free(blocks);
+}
+
 /*
  * Finds all leafs with a reference to the specified combination of bytenr and
  * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **leafs)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **leafs,
+                               const u64 *extent_item_pos)
 {
        struct ulist *tmp;
        int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
-       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                               time_seq, *leafs, tmp, extent_item_pos);
        ulist_free(tmp);
 
        if (ret < 0 && ret != -ENOENT) {
-               ulist_free(*leafs);
+               free_leaf_list(*leafs);
                return ret;
        }
 
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots)
 {
        struct ulist *tmp;
        struct ulist_node *node = NULL;
+       struct ulist_iterator uiter;
        int ret;
 
        tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        }
 
+       ULIST_ITER_INIT(&uiter);
        while (1) {
-               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
-                                       tmp, *roots);
+               ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                                       time_seq, tmp, *roots, NULL);
                if (ret < 0 && ret != -ENOENT) {
                        ulist_free(tmp);
                        ulist_free(*roots);
                        return ret;
                }
-               node = ulist_next(tmp, node);
+               node = ulist_next(tmp, &uiter);
                if (!node)
                        break;
                bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
        return 0;
 }
 
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
-                               u64 orig_extent_item_objectid,
-                               u64 extent_item_pos, u64 root,
+/*
+ * Walk @inode_list and call @iterate(inum, offset, @root, @ctx) once for
+ * each element.  @extent_item_objectid is only used in the debug output.
+ *
+ * The walk stops at the first callback that returns non-zero and that value
+ * is returned to the caller; 0 is returned when the list is empty or every
+ * callback returned 0.
+ */
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+                               u64 root, u64 extent_item_objectid,
                                iterate_extent_inodes_t *iterate, void *ctx)
 {
-       u64 disk_byte;
-       struct btrfs_key key;
-       struct btrfs_file_extent_item *fi;
-       struct extent_buffer *eb;
-       int slot;
-       int nritems;
+       struct extent_inode_elem *eie;
        int ret = 0;
-       int extent_type;
-       u64 data_offset;
-       u64 data_len;
-
-       eb = read_tree_block(fs_info->tree_root, logical,
-                               fs_info->tree_root->leafsize, 0);
-       if (!eb)
-               return -EIO;
-
-       /*
-        * from the shared data ref, we only have the leaf but we need
-        * the key. thus, we must look into all items and see that we
-        * find one (some) with a reference to our extent item.
-        */
-       nritems = btrfs_header_nritems(eb);
-       for (slot = 0; slot < nritems; ++slot) {
-               btrfs_item_key_to_cpu(eb, &key, slot);
-               if (key.type != BTRFS_EXTENT_DATA_KEY)
-                       continue;
-               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               extent_type = btrfs_file_extent_type(eb, fi);
-               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-                       continue;
-               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
-               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid)
-                       continue;
-
-               data_offset = btrfs_file_extent_offset(eb, fi);
-               data_len = btrfs_file_extent_num_bytes(eb, fi);
-
-               if (extent_item_pos < data_offset ||
-                   extent_item_pos >= data_offset + data_len)
-                       continue;
 
+       for (eie = inode_list; eie; eie = eie->next) {
                pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
-                               "root %llu\n", orig_extent_item_objectid,
-                               key.objectid, key.offset, root);
-               ret = iterate(key.objectid,
-                               key.offset + (extent_item_pos - data_offset),
-                               root, ctx);
+                        "root %llu\n", extent_item_objectid,
+                        eie->inum, eie->offset, root);
+               ret = iterate(eie->inum, eie->offset, root, ctx);
                if (ret) {
-                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       pr_debug("stopping iteration for %llu due to ret=%d\n",
+                                extent_item_objectid, ret);
                        break;
                }
        }
 
-       free_extent_buffer(eb);
-
        return ret;
 }
 
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
        struct ulist *roots = NULL;
        struct ulist_node *ref_node = NULL;
        struct ulist_node *root_node = NULL;
-       struct seq_list seq_elem;
+       struct seq_list seq_elem = {};
+       struct seq_list tree_mod_seq_elem = {};
+       struct ulist_iterator ref_uiter;
+       struct ulist_iterator root_uiter;
        struct btrfs_delayed_ref_root *delayed_refs = NULL;
 
        pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                spin_lock(&delayed_refs->lock);
                btrfs_get_delayed_seq(delayed_refs, &seq_elem);
                spin_unlock(&delayed_refs->lock);
+               btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
        }
 
        ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-                                  extent_item_pos, seq_elem.seq,
-                                  &refs);
-
+                                  seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+                                  &extent_item_pos);
        if (ret)
                goto out;
 
-       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
-               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
-                                               seq_elem.seq, &roots);
+       ULIST_ITER_INIT(&ref_uiter);
+       while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+                                               seq_elem.seq,
+                                               tree_mod_seq_elem.seq, &roots);
                if (ret)
                        break;
-               while (!ret && (root_node = ulist_next(roots, root_node))) {
-                       pr_debug("root %llu references leaf %llu\n",
-                                       root_node->val, ref_node->val);
-                       ret = iterate_leaf_refs(fs_info, ref_node->val,
-                                               extent_item_objectid,
-                                               extent_item_pos, root_node->val,
-                                               iterate, ctx);
+               ULIST_ITER_INIT(&root_uiter);
+               while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+                       pr_debug("root %llu references leaf %llu, data list "
+                                "%#lx\n", root_node->val, ref_node->val,
+                                ref_node->aux);
+                       ret = iterate_leaf_refs(
+                               (struct extent_inode_elem *)ref_node->aux,
+                               root_node->val, extent_item_objectid,
+                               iterate, ctx);
                }
+               ulist_free(roots);
+               roots = NULL;
        }
 
-       ulist_free(refs);
+       free_leaf_list(refs);
        ulist_free(roots);
 out:
        if (!search_commit_root) {
+               btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
                btrfs_put_delayed_seq(delayed_refs, &seq_elem);
                btrfs_end_transaction(trans, fs_info->extent_root);
        }
index 57ea2e959e4dcfaba89e4ee0b833f5744c3639d3..c18d8ac7b795da487c4a526979954e91cbddf52b 100644 (file)
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots);
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots);
 
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
index 9b9b15fd5204347c5ef2931fb186af679cb0d369..e616f8872e69bb0cf3b9a3f369ba49eeca57f2de 100644 (file)
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * Numbered per-inode runtime flags.
+ * NOTE(review): presumably bit indexes into btrfs_inode->runtime_flags;
+ * confirm against the users of these constants.
+ *
+ * BTRFS_INODE_ORDERED_DATA_CLOSE is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE         0
+#define BTRFS_INODE_ORPHAN_META_RESERVED       1
+#define BTRFS_INODE_DUMMY                      2
+#define BTRFS_INODE_IN_DEFRAG                  3
+#define BTRFS_INODE_DELALLOC_META_RESERVED     4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM            5
+
 /* in memory btrfs inode */
 struct btrfs_inode {
        /* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
        /* used to order data wrt metadata */
        struct btrfs_ordered_inode_tree ordered_tree;
 
-       /* for keeping track of orphaned inodes */
-       struct list_head i_orphan;
-
        /* list of all the delalloc inodes in the FS.  There are times we need
         * to write all the delalloc pages to disk, and this list is used
         * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
        /* the space_info for where this inode's data allocations are done */
        struct btrfs_space_info *space_info;
 
+       unsigned long runtime_flags;
+
        /* full 64 bit generation number, struct vfs_inode doesn't have a big
         * enough field for this.
         */
        u64 generation;
 
-       /* sequence number for NFS changes */
-       u64 sequence;
-
        /*
         * transid of the trans_handle that last modified this inode
         */
@@ -144,23 +154,10 @@ struct btrfs_inode {
        unsigned outstanding_extents;
        unsigned reserved_extents;
 
-       /*
-        * ordered_data_close is set by truncate when a file that used
-        * to have good data has been truncated to zero.  When it is set
-        * the btrfs file release call will add this inode to the
-        * ordered operations list so that we make sure to flush out any
-        * new data the application may have written before commit.
-        */
-       unsigned ordered_data_close:1;
-       unsigned orphan_meta_reserved:1;
-       unsigned dummy_inode:1;
-       unsigned in_defrag:1;
-       unsigned delalloc_meta_reserved:1;
-
        /*
         * always compress this one file
         */
-       unsigned force_compress:4;
+       unsigned force_compress;
 
        struct btrfs_delayed_node *delayed_node;
 
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
        return false;
 }
 
+/*
+ * Check the inode's logging state against @generation.
+ *
+ * Returns 1 when, with root->log_mutex held, the inode's logged_trans equals
+ * @generation and its last_sub_trans does not exceed the root's
+ * last_log_commit; returns 0 otherwise.
+ */
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+       struct btrfs_inode *bi = BTRFS_I(inode);
+       struct btrfs_root *root = bi->root;
+       int in_log;
+
+       mutex_lock(&root->log_mutex);
+       in_log = bi->logged_trans == generation &&
+                bi->last_sub_trans <= root->last_log_commit;
+       mutex_unlock(&root->log_mutex);
+
+       return in_log;
+}
+
 #endif
index c053e90f2006f580ed4f8a4440fb520639c3edd6..9cebb1fd6a3cc59919c7c990d3016caee52b5849 100644 (file)
 #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
 #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
                                                         * excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
 #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
 
 /*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
        u64 dev_bytenr;         /* physical bytenr on device */
        u32 len;
        struct btrfsic_dev_state *dev;
-       char *data;
-       struct buffer_head *bh; /* do not use if set to NULL */
+       char **datav;
+       struct page **pagev;
+       void *mem_to_free;
 };
 
 /* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
        struct btrfs_root *root;
        u64 max_superblock_generation;
        struct btrfsic_block *latest_superblock;
+       u32 metablock_size;
+       u32 datablock_size;
 };
 
 static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 static int btrfsic_process_metablock(struct btrfsic_state *state,
                                     struct btrfsic_block *block,
                                     struct btrfsic_block_data_ctx *block_ctx,
-                                    struct btrfs_header *hdr,
                                     int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dst, u32 offset, size_t len);
 static int btrfsic_create_link_to_next_block(
                struct btrfsic_state *state,
                struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx);
 static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size);
+                                    char **datav, unsigned int num_pages);
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr, u8 *mapped_data,
-                                         unsigned int len, struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                          struct buffer_head *bh,
                                          int submit_bio_bh_rw);
 static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                           u64 bytenr,
                                           struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data);
+                                          u64 dev_bytenr);
 
 static struct mutex btrfsic_mutex;
 static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
        int pass;
 
        BUG_ON(NULL == state);
-       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
        if (NULL == selected_super) {
                printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                        struct btrfsic_block *next_block;
                        struct btrfsic_block_data_ctx tmp_next_block_ctx;
                        struct btrfsic_block_link *l;
-                       struct btrfs_header *hdr;
 
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               state->metablock_size,
                                                &tmp_next_block_ctx,
                                                mirror_num);
                        if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                        BUG_ON(NULL == l);
 
                        ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                       if (ret < (int)PAGE_CACHE_SIZE) {
                                printk(KERN_INFO
                                       "btrfsic: read @logical %llu failed!\n",
                                       (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                                return -1;
                        }
 
-                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
                        ret = btrfsic_process_metablock(state,
                                                        next_block,
                                                        &tmp_next_block_ctx,
-                                                       hdr,
                                                        BTRFS_MAX_LEVEL + 3, 1);
                        btrfsic_release_block_ctx(&tmp_next_block_ctx);
                }
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
 
        /* super block bytenr is always the unmapped device bytenr */
        dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+               return -1;
+       bh = __bread(superblock_bdev, dev_bytenr / 4096,
+                    BTRFS_SUPER_INFO_SIZE);
        if (NULL == bh)
                return -1;
        super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
        if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
            strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
                    sizeof(super_tmp->magic)) ||
-           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+           btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+           btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+           btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
                brelse(bh);
                return 0;
        }
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
                        struct btrfsic_block_data_ctx tmp_next_block_ctx;
                        struct btrfsic_block_link *l;
 
-                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       if (btrfsic_map_block(state, next_bytenr,
+                                             state->metablock_size,
                                              &tmp_next_block_ctx,
                                              mirror_num)) {
                                printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
                struct btrfsic_state *state,
                struct btrfsic_block *const first_block,
                struct btrfsic_block_data_ctx *const first_block_ctx,
-               struct btrfs_header *const first_hdr,
                int first_limit_nesting, int force_iodone_flag)
 {
        struct btrfsic_stack_frame initial_stack_frame = { 0 };
        struct btrfsic_stack_frame *sf;
        struct btrfsic_stack_frame *next_stack;
+       struct btrfs_header *const first_hdr =
+               (struct btrfs_header *)first_block_ctx->datav[0];
 
+       BUG_ON(!first_hdr);
        sf = &initial_stack_frame;
        sf->error = 0;
        sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
                }
 
                if (sf->i < sf->nr) {
-                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
-                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       struct btrfs_item disk_item;
+                       u32 disk_item_offset =
+                               (uintptr_t)(leafhdr->items + sf->i) -
+                               (uintptr_t)leafhdr;
+                       struct btrfs_disk_key *disk_key;
                        u8 type;
-                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+                       u32 item_offset;
 
+                       if (disk_item_offset + sizeof(struct btrfs_item) >
+                           sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+                               printk(KERN_INFO
+                                      "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(sf->block_ctx,
+                                                    &disk_item,
+                                                    disk_item_offset,
+                                                    sizeof(struct btrfs_item));
+                       item_offset = le32_to_cpu(disk_item.offset);
+                       disk_key = &disk_item.key;
                        type = disk_key->type;
 
                        if (BTRFS_ROOT_ITEM_KEY == type) {
-                               const struct btrfs_root_item *const root_item =
-                                   (struct btrfs_root_item *)
-                                   (sf->block_ctx->data +
-                                    offsetof(struct btrfs_leaf, items) +
-                                    item_offset);
-                               const u64 next_bytenr =
-                                   le64_to_cpu(root_item->bytenr);
+                               struct btrfs_root_item root_item;
+                               u32 root_item_offset;
+                               u64 next_bytenr;
+
+                               root_item_offset = item_offset +
+                                       offsetof(struct btrfs_leaf, items);
+                               if (root_item_offset +
+                                   sizeof(struct btrfs_root_item) >
+                                   sf->block_ctx->len)
+                                       goto leaf_item_out_of_bounce_error;
+                               btrfsic_read_from_block_data(
+                                       sf->block_ctx, &root_item,
+                                       root_item_offset,
+                                       sizeof(struct btrfs_root_item));
+                               next_bytenr = le64_to_cpu(root_item.bytenr);
 
                                sf->error =
                                    btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
                                                &sf->num_copies,
                                                &sf->mirror_num,
                                                disk_key,
-                                               le64_to_cpu(root_item->
+                                               le64_to_cpu(root_item.
                                                generation));
                                if (sf->error)
                                        goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
                                if (NULL != sf->next_block) {
                                        struct btrfs_header *const next_hdr =
                                            (struct btrfs_header *)
-                                           sf->next_block_ctx.data;
+                                           sf->next_block_ctx.datav[0];
 
                                        next_stack =
                                            btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
                }
 
                if (sf->i < sf->nr) {
-                       struct btrfs_key_ptr *disk_key_ptr =
-                           nodehdr->ptrs + sf->i;
-                       const u64 next_bytenr =
-                           le64_to_cpu(disk_key_ptr->blockptr);
+                       struct btrfs_key_ptr key_ptr;
+                       u32 key_ptr_offset;
+                       u64 next_bytenr;
+
+                       key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+                                         (uintptr_t)nodehdr;
+                       if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+                           sf->block_ctx->len) {
+                               printk(KERN_INFO
+                                      "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(
+                               sf->block_ctx, &key_ptr, key_ptr_offset,
+                               sizeof(struct btrfs_key_ptr));
+                       next_bytenr = le64_to_cpu(key_ptr.blockptr);
 
                        sf->error = btrfsic_create_link_to_next_block(
                                        state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
                                        force_iodone_flag,
                                        &sf->num_copies,
                                        &sf->mirror_num,
-                                       &disk_key_ptr->key,
-                                       le64_to_cpu(disk_key_ptr->generation));
+                                       &key_ptr.key,
+                                       le64_to_cpu(key_ptr.generation));
                        if (sf->error)
                                goto one_stack_frame_backwards;
 
                        if (NULL != sf->next_block) {
                                struct btrfs_header *const next_hdr =
                                    (struct btrfs_header *)
-                                   sf->next_block_ctx.data;
+                                   sf->next_block_ctx.datav[0];
 
                                next_stack = btrfsic_stack_frame_alloc();
                                if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
        return sf->error;
 }
 
+/*
+ * Copy @len bytes, starting @offset bytes into the block described by
+ * @block_ctx, into the buffer @dstv.
+ *
+ * The block's bytes are reached through block_ctx->datav[], one pointer per
+ * page, so the copy is done page by page.  The position of the first byte is
+ * derived from the in-page part of block_ctx->start plus @offset.
+ *
+ * A request reaching beyond block_ctx->len triggers WARN_ON(); indexing
+ * datav[] past the number of pages covering block_ctx->len triggers BUG_ON().
+ */
+static void btrfsic_read_from_block_data(
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dstv, u32 offset, size_t len)
+{
+       char *out = (char *)dstv;
+       size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1);
+       unsigned long page_idx = (start_offset + offset) >> PAGE_CACHE_SHIFT;
+       size_t in_page = (start_offset + offset) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+       WARN_ON(offset + len > block_ctx->len);
+
+       /* only the first page can start at a non-zero in-page offset */
+       for (; len > 0; page_idx++, in_page = 0) {
+               size_t chunk = min(len, ((size_t)PAGE_CACHE_SIZE - in_page));
+
+               BUG_ON(page_idx >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+                                  PAGE_CACHE_SHIFT);
+               memcpy(out, block_ctx->datav[page_idx] + in_page, chunk);
+
+               out += chunk;
+               len -= chunk;
+       }
+}
+
 static int btrfsic_create_link_to_next_block(
                struct btrfsic_state *state,
                struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
        if (0 == *num_copiesp) {
                *num_copiesp =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
                       "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
                       *mirror_nump);
        ret = btrfsic_map_block(state, next_bytenr,
-                               BTRFSIC_BLOCK_SIZE,
+                               state->metablock_size,
                                next_block_ctx, *mirror_nump);
        if (ret) {
                printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
 
        if (limit_nesting > 0 && did_alloc_block_link) {
                ret = btrfsic_read_block(state, next_block_ctx);
-               if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+               if (ret < (int)next_block_ctx->len) {
                        printk(KERN_INFO
                               "btrfsic: read block @logical %llu failed!\n",
                               (unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
                u32 item_offset, int force_iodone_flag)
 {
        int ret;
-       struct btrfs_file_extent_item *file_extent_item =
-           (struct btrfs_file_extent_item *)(block_ctx->data +
-                                             offsetof(struct btrfs_leaf,
-                                                      items) + item_offset);
-       u64 next_bytenr =
-           le64_to_cpu(file_extent_item->disk_bytenr) +
-           le64_to_cpu(file_extent_item->offset);
-       u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
-       u64 generation = le64_to_cpu(file_extent_item->generation);
+       struct btrfs_file_extent_item file_extent_item;
+       u64 file_extent_item_offset;
+       u64 next_bytenr;
+       u64 num_bytes;
+       u64 generation;
        struct btrfsic_block_link *l;
 
+       file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+                                 item_offset;
+       if (file_extent_item_offset +
+           offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounds at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+               file_extent_item_offset,
+               offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+       if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+           ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                       printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+                              file_extent_item.type,
+                              (unsigned long long)
+                              le64_to_cpu(file_extent_item.disk_bytenr));
+               return 0;
+       }
+
+       /* REG extent, disk_bytenr != 0: the whole item is needed now */
+       if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounds at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+                                    file_extent_item_offset,
+                                    sizeof(struct btrfs_file_extent_item));
+       next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+                     le64_to_cpu(file_extent_item.offset);
+       num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+       generation = le64_to_cpu(file_extent_item.generation);
+
        if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
                printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
                       " offset = %llu, num_bytes = %llu\n",
-                      file_extent_item->type,
+                      file_extent_item.type,
                       (unsigned long long)
-                      le64_to_cpu(file_extent_item->disk_bytenr),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->offset),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->num_bytes));
-       if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
-           ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
-               return 0;
+                      le64_to_cpu(file_extent_item.disk_bytenr),
+                      (unsigned long long)le64_to_cpu(file_extent_item.offset),
+                      (unsigned long long)num_bytes);
        while (num_bytes > 0) {
                u32 chunk_len;
                int num_copies;
                int mirror_num;
 
-               if (num_bytes > BTRFSIC_BLOCK_SIZE)
-                       chunk_len = BTRFSIC_BLOCK_SIZE;
+               if (num_bytes > state->datablock_size)
+                       chunk_len = state->datablock_size;
                else
                        chunk_len = num_bytes;
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->datablock_size);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
        block_ctx_out->dev_bytenr = multi->stripes[0].physical;
        block_ctx_out->start = bytenr;
        block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
 
        if (0 == ret)
                kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
        block_ctx_out->dev_bytenr = bytenr;
        block_ctx_out->start = bytenr;
        block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
        if (NULL != block_ctx_out->dev) {
                return 0;
        } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
 
 static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
 {
-       if (NULL != block_ctx->bh) {
-               brelse(block_ctx->bh);
-               block_ctx->bh = NULL;
+       if (block_ctx->mem_to_free) {
+               unsigned int num_pages;
+
+               BUG_ON(!block_ctx->datav);
+               BUG_ON(!block_ctx->pagev);
+               num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT;
+               while (num_pages > 0) {
+                       num_pages--;
+                       if (block_ctx->datav[num_pages]) {
+                               kunmap(block_ctx->pagev[num_pages]);
+                               block_ctx->datav[num_pages] = NULL;
+                       }
+                       if (block_ctx->pagev[num_pages]) {
+                               __free_page(block_ctx->pagev[num_pages]);
+                               block_ctx->pagev[num_pages] = NULL;
+                       }
+               }
+
+               kfree(block_ctx->mem_to_free);
+               block_ctx->mem_to_free = NULL;
+               block_ctx->pagev = NULL;
+               block_ctx->datav = NULL;
        }
 }
 
 static int btrfsic_read_block(struct btrfsic_state *state,
                              struct btrfsic_block_data_ctx *block_ctx)
 {
-       block_ctx->bh = NULL;
-       if (block_ctx->dev_bytenr & 4095) {
+       unsigned int num_pages;
+       unsigned int i;
+       u64 dev_bytenr;
+       int ret;
+
+       BUG_ON(block_ctx->datav);
+       BUG_ON(block_ctx->pagev);
+       BUG_ON(block_ctx->mem_to_free);
+       if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
                printk(KERN_INFO
                       "btrfsic: read_block() with unaligned bytenr %llu\n",
                       (unsigned long long)block_ctx->dev_bytenr);
                return -1;
        }
-       if (block_ctx->len > 4096) {
-               printk(KERN_INFO
-                      "btrfsic: read_block() with too huge size %d\n",
-                      block_ctx->len);
+
+       num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                   PAGE_CACHE_SHIFT;
+       block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+                                         sizeof(*block_ctx->pagev)) *
+                                        num_pages, GFP_NOFS);
+       if (!block_ctx->mem_to_free)
                return -1;
+       block_ctx->datav = block_ctx->mem_to_free;
+       block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+               if (!block_ctx->pagev[i])
+                       return -1;
        }
 
-       block_ctx->bh = __bread(block_ctx->dev->bdev,
-                               block_ctx->dev_bytenr >> 12, 4096);
-       if (NULL == block_ctx->bh)
-               return -1;
-       block_ctx->data = block_ctx->bh->b_data;
+       dev_bytenr = block_ctx->dev_bytenr;
+       for (i = 0; i < num_pages;) {
+               struct bio *bio;
+               unsigned int j;
+               DECLARE_COMPLETION_ONSTACK(complete);
+
+               bio = bio_alloc(GFP_NOFS, num_pages - i);
+               if (!bio) {
+                       printk(KERN_INFO
+                              "btrfsic: bio_alloc() for %u pages failed!\n",
+                              num_pages - i);
+                       return -1;
+               }
+               bio->bi_bdev = block_ctx->dev->bdev;
+               bio->bi_sector = dev_bytenr >> 9;
+               bio->bi_end_io = btrfsic_complete_bio_end_io;
+               bio->bi_private = &complete;
+
+               for (j = i; j < num_pages; j++) {
+                       ret = bio_add_page(bio, block_ctx->pagev[j],
+                                          PAGE_CACHE_SIZE, 0);
+                       if (PAGE_CACHE_SIZE != ret)
+                               break;
+               }
+               if (j == i) {
+                       printk(KERN_INFO "btrfsic: error, failed to add a single page!\n");
+                       bio_put(bio); /* never submitted: drop our reference */
+                       return -1;
+               }
+               submit_bio(READ, bio);
+
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                       printk(KERN_INFO
+                              "btrfsic: read error at logical %llu dev %s!\n",
+                              block_ctx->start, block_ctx->dev->name);
+                       bio_put(bio);
+                       return -1;
+               }
+               bio_put(bio);
+               dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+               i = j;
+       }
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+               if (!block_ctx->datav[i]) {
+                       printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+                              block_ctx->dev->name);
+                       return -1;
+               }
+       }
 
        return block_ctx->len;
 }
 
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+       complete((struct completion *)bio->bi_private);
+}
+
 static void btrfsic_dump_database(struct btrfsic_state *state)
 {
        struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
  * (note that this test fails for the super block)
  */
 static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size)
+                                    char **datav, unsigned int num_pages)
 {
        struct btrfs_header *h;
        u8 csum[BTRFS_CSUM_SIZE];
        u32 crc = ~(u32)0;
-       int fail = 0;
-       int crc_fail = 0;
+       unsigned int i;
 
-       h = (struct btrfs_header *)data;
+       if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+               return 1; /* not metadata */
+       num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+       h = (struct btrfs_header *)datav[0];
 
        if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
-               fail++;
+               return 1;
+
+       for (i = 0; i < num_pages; i++) {
+               u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+               size_t sublen = i ? PAGE_CACHE_SIZE :
+                                   (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
 
-       crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+               crc = crc32c(crc, data, sublen);
+       }
        btrfs_csum_final(crc, csum);
        if (memcmp(csum, h->csum, state->csum_size))
-               crc_fail++;
+               return 1;
 
-       return fail || crc_fail;
+       return 0; /* is metadata */
 }
 
 static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr,
-                                         u8 *mapped_data, unsigned int len,
-                                         struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                          struct buffer_head *bh,
                                          int submit_bio_bh_rw)
 {
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
        int ret;
        struct btrfsic_state *state = dev_state->state;
        struct block_device *bdev = dev_state->bdev;
+       unsigned int processed_len;
 
-       WARN_ON(len > PAGE_SIZE);
-       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
        if (NULL != bio_is_patched)
                *bio_is_patched = 0;
 
+again:
+       if (num_pages == 0)
+               return;
+
+       processed_len = 0;
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+                                                     num_pages));
+
        block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
                                               &state->block_hashtable);
        if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 
                if (block->is_superblock) {
                        bytenr = le64_to_cpu(((struct btrfs_super_block *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           BTRFS_SUPER_INFO_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
                        is_metadata = 1;
+                       BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+                       processed_len = BTRFS_SUPER_INFO_SIZE;
                        if (state->print_mask &
                            BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
                                printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
                if (is_metadata) {
                        if (!block->is_superblock) {
+                               if (num_pages * PAGE_CACHE_SIZE <
+                                   state->metablock_size) {
+                                       printk(KERN_INFO
+                                              "btrfsic: cannot work with too short bios!\n");
+                                       return;
+                               }
+                               processed_len = state->metablock_size;
                                bytenr = le64_to_cpu(((struct btrfs_header *)
-                                                     mapped_data)->bytenr);
+                                                     mapped_datav[0])->bytenr);
                                btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
                                                               dev_state,
-                                                              dev_bytenr,
-                                                              mapped_data);
+                                                              dev_bytenr);
                        }
                        if (block->logical_bytenr != bytenr) {
                                printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                       block->mirror_num,
                                       btrfsic_get_block_type(state, block));
                } else {
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           state->datablock_size) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
+                       processed_len = state->datablock_size;
                        bytenr = block->logical_bytenr;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                               le64_to_cpu(block->disk_key.offset),
                               (unsigned long long)
                               le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation),
+                                           mapped_datav[0])->generation),
                               (unsigned long long)
                               state->max_superblock_generation);
                        btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                               (unsigned long long)block->generation,
                               (unsigned long long)
                               le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation));
+                                           mapped_datav[0])->generation));
                        /* it would not be safe to go on */
                        btrfsic_dump_tree(state);
-                       return;
+                       goto continue_loop;
                }
 
                /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
 
                if (block->is_superblock)
-                       ret = btrfsic_map_superblock(state, bytenr, len,
+                       ret = btrfsic_map_superblock(state, bytenr,
+                                                    processed_len,
                                                     bdev, &block_ctx);
                else
-                       ret = btrfsic_map_block(state, bytenr, len,
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
                                                &block_ctx, 0);
                if (ret) {
                        printk(KERN_INFO
                               "btrfsic: btrfsic_map_block(root @%llu)"
                               " failed!\n", (unsigned long long)bytenr);
-                       return;
+                       goto continue_loop;
                }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                /* the following is required in case of writes to mirrors,
                 * use the same that was used for the lookup */
                block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                        block->logical_bytenr = bytenr;
                        block->is_metadata = 1;
                        if (block->is_superblock) {
+                               BUG_ON(PAGE_CACHE_SIZE !=
+                                      BTRFS_SUPER_INFO_SIZE);
                                ret = btrfsic_process_written_superblock(
                                                state,
                                                block,
                                                (struct btrfs_super_block *)
-                                               mapped_data);
+                                               mapped_datav[0]);
                                if (state->print_mask &
                                    BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
                                        printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                                state,
                                                block,
                                                &block_ctx,
-                                               (struct btrfs_header *)
-                                               block_ctx.data,
                                                0, 0);
                        }
                        if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                u64 bytenr;
 
                if (!is_metadata) {
+                       processed_len = state->datablock_size;
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO "Written block (%s/%llu/?)"
                                       " !found in hash table, D.\n",
                                       dev_state->name,
                                       (unsigned long long)dev_bytenr);
-                       if (!state->include_extent_data)
-                               return; /* ignore that written D block */
+                       if (!state->include_extent_data) {
+                               /* ignore that written D block */
+                               goto continue_loop;
+                       }
 
                        /* this is getting ugly for the
                         * include_extent_data case... */
                        bytenr = 0;     /* unknown */
                        block_ctx.start = bytenr;
-                       block_ctx.len = len;
-                       block_ctx.bh = NULL;
+                       block_ctx.len = processed_len;
+                       block_ctx.mem_to_free = NULL;
+                       block_ctx.pagev = NULL;
                } else {
+                       processed_len = state->metablock_size;
                        bytenr = le64_to_cpu(((struct btrfs_header *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
                        btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
-                                                      dev_bytenr,
-                                                      mapped_data);
+                                                      dev_bytenr);
                        if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                printk(KERN_INFO
                                       "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                       dev_state->name,
                                       (unsigned long long)dev_bytenr);
 
-                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
-                                               0);
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
+                                               &block_ctx, 0);
                        if (ret) {
                                printk(KERN_INFO
                                       "btrfsic: btrfsic_map_block(root @%llu)"
                                       " failed!\n",
                                       (unsigned long long)dev_bytenr);
-                               return;
+                               goto continue_loop;
                        }
                }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                /* the following is required in case of writes to mirrors,
                 * use the same that was used for the lookup */
                block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                if (NULL == block) {
                        printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                        btrfsic_release_block_ctx(&block_ctx);
-                       return;
+                       goto continue_loop;
                }
                block->dev_state = dev_state;
                block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 
                if (is_metadata) {
                        ret = btrfsic_process_metablock(state, block,
-                                                       &block_ctx,
-                                                       (struct btrfs_header *)
-                                                       block_ctx.data, 0, 0);
+                                                       &block_ctx, 0, 0);
                        if (ret)
                                printk(KERN_INFO
                                       "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                }
                btrfsic_release_block_ctx(&block_ctx);
        }
+
+continue_loop:
+       BUG_ON(!processed_len);
+       dev_bytenr += processed_len;
+       mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+       num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+       goto again;
 }
 
 static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
 
                num_copies =
                    btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, BTRFS_SUPER_INFO_SIZE);
                if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                        printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                               (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
                                printk(KERN_INFO
                                       "btrfsic_process_written_superblock("
                                       "mirror_num=%d)\n", mirror_num);
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               BTRFS_SUPER_INFO_SIZE,
                                                &tmp_next_block_ctx,
                                                mirror_num);
                        if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
 static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                           u64 bytenr,
                                           struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data)
+                                          u64 dev_bytenr)
 {
        int num_copies;
        int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
        int match = 0;
 
        num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                     bytenr, PAGE_SIZE);
+                                     bytenr, state->metablock_size);
 
        for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+               ret = btrfsic_map_block(state, bytenr, state->metablock_size,
                                        &block_ctx, mirror_num);
                if (ret) {
                        printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                       (unsigned long long)bytenr, dev_state->name,
                       (unsigned long long)dev_bytenr);
                for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-                       ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, bytenr,
+                                               state->metablock_size,
                                                &block_ctx, mirror_num);
                        if (ret)
                                continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
                               (unsigned long)bh->b_size, bh->b_data,
                               bh->b_bdev);
                btrfsic_process_written_block(dev_state, dev_bytenr,
-                                             bh->b_data, bh->b_size, NULL,
+                                             &bh->b_data, 1, NULL,
                                              NULL, bh, rw);
        } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                if (dev_state->state->print_mask &
                    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                        printk(KERN_INFO
-                              "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
                               rw, bh->b_bdev);
                if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                        if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                unsigned int i;
                u64 dev_bytenr;
                int bio_is_patched;
+               char **mapped_datav;
 
                dev_bytenr = 512 * bio->bi_sector;
                bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                               (unsigned long long)dev_bytenr,
                               bio->bi_bdev);
 
+               mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+                                      GFP_NOFS);
+               if (!mapped_datav)
+                       goto leave;
                for (i = 0; i < bio->bi_vcnt; i++) {
-                       u8 *mapped_data;
-
-                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+                       mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+                       if (!mapped_datav[i]) {
+                               while (i > 0) {
+                                       i--;
+                                       kunmap(bio->bi_io_vec[i].bv_page);
+                               }
+                               kfree(mapped_datav);
+                               goto leave;
+                       }
                        if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                             BTRFSIC_PRINT_MASK_VERBOSE) ==
                            (dev_state->state->print_mask &
                             (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                              BTRFSIC_PRINT_MASK_VERBOSE)))
                                printk(KERN_INFO
-                                      "#%u: page=%p, mapped=%p, len=%u,"
-                                      " offset=%u\n",
+                                      "#%u: page=%p, len=%u, offset=%u\n",
                                       i, bio->bi_io_vec[i].bv_page,
-                                      mapped_data,
                                       bio->bi_io_vec[i].bv_len,
                                       bio->bi_io_vec[i].bv_offset);
-                       btrfsic_process_written_block(dev_state, dev_bytenr,
-                                                     mapped_data,
-                                                     bio->bi_io_vec[i].bv_len,
-                                                     bio, &bio_is_patched,
-                                                     NULL, rw);
+               }
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             mapped_datav, bio->bi_vcnt,
+                                             bio, &bio_is_patched,
+                                             NULL, rw);
+               while (i > 0) {
+                       i--;
                        kunmap(bio->bi_io_vec[i].bv_page);
-                       dev_bytenr += bio->bi_io_vec[i].bv_len;
                }
+               kfree(mapped_datav);
        } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                if (dev_state->state->print_mask &
                    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                        printk(KERN_INFO
-                              "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
                               rw, bio->bi_bdev);
                if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                        if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                        bio->bi_end_io = btrfsic_bio_end_io;
                }
        }
+leave:
        mutex_unlock(&btrfsic_mutex);
 
        submit_bio(rw, bio);
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
        struct list_head *dev_head = &fs_devices->devices;
        struct btrfs_device *device;
 
+       if (root->nodesize != root->leafsize) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+                      root->nodesize, root->leafsize);
+               return -1;
+       }
+       if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
        state = kzalloc(sizeof(*state), GFP_NOFS);
        if (NULL == state) {
                printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
        state->print_mask = print_mask;
        state->include_extent_data = including_extent_data;
        state->csum_size = 0;
+       state->metablock_size = root->nodesize;
+       state->datablock_size = root->sectorsize;
        INIT_LIST_HEAD(&state->all_blocks_list);
        btrfsic_block_hashtable_init(&state->block_hashtable);
        btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
                                btrfsic_block_link_free(l);
                }
 
-               if (b_all->is_iodone)
+               if (b_all->is_iodone || b_all->never_written)
                        btrfsic_block_free(b_all);
                else
                        printk(KERN_INFO "btrfs: attempt to free %c-block"
index 4106264fbc655ac79b26efa1177384ea92b72988..d7a96cfdc50ae6a2d8afef1dad7ca3642248bbb8 100644 (file)
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct btrfs_path *path, int level, int slot);
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+                                         u32 blocksize, u64 parent_transid,
+                                         u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+                                               u64 bytenr, u32 blocksize,
+                                               u64 time_seq);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                     new_root_objectid, &disk_key, level,
-                                    buf->start, 0, 1);
+                                    buf->start, 0);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * Kinds of modification recorded in the tree mod log. The value is stored in
+ * tree_mod_elem.op and tells __tree_mod_log_rewind() how to undo the entry.
+ */
+enum mod_log_op {
+       /* key pointer of a slot is about to be overwritten */
+       MOD_LOG_KEY_REPLACE,
+       /* key pointer appeared in a slot (logged for copy destinations) */
+       MOD_LOG_KEY_ADD,
+       /* key pointer left a slot (logged for copy sources) */
+       MOD_LOG_KEY_REMOVE,
+       /* key pointer dropped because the whole block is being freed */
+       MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+       /* key pointer overwritten by a move inside the same block */
+       MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+       /* a run of key pointers was moved inside one block */
+       MOD_LOG_MOVE_KEYS,
+       /* the root node of a tree was replaced by another block */
+       MOD_LOG_ROOT_REPLACE,
+};
+
+/*
+ * Payload of a MOD_LOG_MOVE_KEYS entry. The source slot of the move is kept
+ * in tree_mod_elem.slot.
+ */
+struct tree_mod_move {
+       int dst_slot;           /* first slot the key pointers were moved to */
+       int nr_items;           /* number of key pointers that were moved */
+};
+
+/* Payload of a MOD_LOG_ROOT_REPLACE entry: describes the replaced root. */
+struct tree_mod_root {
+       u64 logical;            /* ->start of the old root's extent buffer */
+       u8 level;               /* header level of the old root */
+};
+
+/*
+ * One entry of the tree mod log. Entries are kept in the
+ * fs_info->tree_mod_log rb-tree, keyed by (index, elem.seq).
+ */
+struct tree_mod_elem {
+       struct rb_node node;    /* linkage in fs_info->tree_mod_log */
+       u64 index;              /* shifted logical */
+       struct seq_list elem;   /* sequence number and seq list linkage */
+       enum mod_log_op op;
+
+       /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+       int slot;
+
+       /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+       u64 generation;
+
+       /* these are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+       struct btrfs_disk_key key;
+       u64 blockptr;
+
+       /* this is used for op == MOD_LOG_MOVE_KEYS */
+       struct tree_mod_move move;
+
+       /* this is used for op == MOD_LOG_ROOT_REPLACE */
+       struct tree_mod_root old_root;
+};
+
+/*
+ * Hand out the next tree mod sequence number to @elem and append @elem to
+ * fs_info->tree_mod_seq_list. Callers such as btrfs_get_tree_mod_seq() and
+ * tree_mod_alloc() hold fs_info->tree_mod_seq_lock around the call.
+ */
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+       struct list_head *seq_list = &fs_info->tree_mod_seq_list;
+
+       elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+       list_add_tail(&elem->list, seq_list);
+}
+
+/*
+ * Register @elem as a blocker: it is flagged (flags = 1), receives a fresh
+ * sequence number and is queued on fs_info->tree_mod_seq_list. Release it
+ * again with btrfs_put_tree_mod_seq().
+ */
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       spinlock_t *seq_lock = &fs_info->tree_mod_seq_lock;
+
+       elem->flags = 1;
+
+       spin_lock(seq_lock);
+       __get_tree_mod_seq(fs_info, elem);
+       spin_unlock(seq_lock);
+}
+
+/*
+ * Drop the blocker @elem. If no other blocker with a lower sequence number
+ * is left, every log entry whose sequence number does not exceed the lowest
+ * remaining blocker is erased from the log and freed.
+ *
+ * An @elem with seq == 0 is ignored.
+ */
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       struct rb_root *log_root = &fs_info->tree_mod_log;
+       struct rb_node *pos;
+       struct rb_node *following;
+       struct seq_list *blocker;
+       struct tree_mod_elem *entry;
+       u64 lowest = (u64)-1;
+       u64 seq_putting = elem->seq;
+
+       if (seq_putting == 0)
+               return;
+
+       BUG_ON(!(elem->flags & 1));
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       list_del(&elem->list);
+
+       /* find the lowest sequence number still held by a blocker */
+       list_for_each_entry(blocker, &fs_info->tree_mod_seq_list, list) {
+               if (!(blocker->flags & 1) || blocker->seq >= lowest)
+                       continue;
+               /*
+                * blocker with lower sequence number exists, we
+                * cannot remove anything from the log
+                */
+               if (blocker->seq < seq_putting)
+                       goto out;
+               lowest = blocker->seq;
+       }
+
+       /*
+        * anything that's lower than the lowest existing (read: blocked)
+        * sequence number can be removed from the tree.
+        */
+       write_lock(&fs_info->tree_mod_log_lock);
+       pos = rb_first(log_root);
+       while (pos) {
+               /* fetch the successor first, pos may be erased below */
+               following = rb_next(pos);
+               entry = container_of(pos, struct tree_mod_elem, node);
+               if (entry->elem.seq <= lowest) {
+                       rb_erase(pos, log_root);
+                       list_del(&entry->elem.list);
+                       kfree(entry);
+               }
+               pos = following;
+       }
+       write_unlock(&fs_info->tree_mod_log_lock);
+out:
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * Links @tm into fs_info->tree_mod_log. Entries with a larger (index, seq)
+ * are placed to the left, so an in-order walk visits them newest first.
+ *
+ * Returns 0 on success. If an entry with the same index and sequence number
+ * already exists, @tm is unlinked from the sequence list, freed, and -EEXIST
+ * is returned; the caller must not touch @tm afterwards.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+       struct rb_root *tm_root;
+       struct rb_node **new;
+       struct rb_node *parent = NULL;
+       struct tree_mod_elem *cur;
+       int ret = 0;
+
+       BUG_ON(!tm || !tm->elem.seq);
+
+       write_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       new = &tm_root->rb_node;
+       while (*new) {
+               cur = container_of(*new, struct tree_mod_elem, node);
+               parent = *new;
+               if (cur->index < tm->index)
+                       new = &((*new)->rb_left);
+               else if (cur->index > tm->index)
+                       new = &((*new)->rb_right);
+               else if (cur->elem.seq < tm->elem.seq)
+                       new = &((*new)->rb_left);
+               else if (cur->elem.seq > tm->elem.seq)
+                       new = &((*new)->rb_right);
+               else {
+                       ret = -EEXIST;
+                       goto unlock;
+               }
+       }
+
+       rb_link_node(&tm->node, parent, new);
+       rb_insert_color(&tm->node, tm_root);
+unlock:
+       write_unlock(&fs_info->tree_mod_log_lock);
+       if (ret) {
+               /*
+                * tm->elem was queued on tree_mod_seq_list when the entry was
+                * allocated (tree_mod_alloc), so it has to be unlinked before
+                * it is freed. This is done after dropping the log lock to
+                * keep the lock order used by btrfs_put_tree_mod_seq()
+                * (seq lock first, then log lock).
+                * NOTE(review): assumes every caller obtains tm from
+                * tree_mod_alloc() - confirm for callers outside this hunk.
+                */
+               spin_lock(&fs_info->tree_mod_seq_lock);
+               list_del(&tm->elem.list);
+               spin_unlock(&fs_info->tree_mod_seq_lock);
+               kfree(tm);
+       }
+       return ret;
+}
+
+/*
+ * Decide whether logging can be skipped. Returns 1 (skip) when nothing is
+ * queued on the sequence list, or when @eb is given and is a level 0 block;
+ * returns 0 (log) otherwise. @eb may be NULL.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+                                   struct extent_buffer *eb) {
+       smp_mb();
+       if (list_empty(&(fs_info)->tree_mod_seq_list))
+               return 1;
+
+       return eb && btrfs_header_level(eb) == 0;
+}
+
+/*
+ * Allocate a zeroed log entry and give it a sequence number.
+ *
+ * Returns the new sequence number (> 0) with the entry stored in *@tm_ret and
+ * already queued on fs_info->tree_mod_seq_list; 0 if nothing needs to be
+ * logged; -ENOMEM if the allocation failed. *@tm_ret is NULL whenever no
+ * entry is handed out, so callers never see a stale or freed pointer.
+ */
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+                                struct tree_mod_elem **tm_ret)
+{
+       struct tree_mod_elem *tm;
+       int seq;
+
+       *tm_ret = NULL;
+
+       if (tree_mod_dont_log(fs_info, NULL))
+               return 0;
+
+       tm = kzalloc(sizeof(*tm), flags);
+       if (!tm)
+               return -ENOMEM;
+
+       tm->elem.flags = 0;
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       if (list_empty(&fs_info->tree_mod_seq_list)) {
+               /*
+                * someone emptied the list while we were waiting for the lock.
+                * we must not add to the list, because no blocker exists. items
+                * are removed from the list only when the existing blocker is
+                * removed from the list.
+                */
+               kfree(tm);
+               seq = 0;
+       } else {
+               __get_tree_mod_seq(fs_info, &tm->elem);
+               seq = tm->elem.seq;
+               *tm_ret = tm;
+       }
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+
+       return seq;
+}
+
+/*
+ * Log operation @op on key pointer @slot of @eb, allocating the entry with
+ * @flags. The current key and block pointer of the slot are saved for every
+ * operation except MOD_LOG_KEY_ADD; the pointer generation is always saved.
+ *
+ * Returns 0 when nothing had to be logged, a negative errno from the
+ * allocation, or the result of __tree_mod_log_insert().
+ */
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+                            struct extent_buffer *eb, int slot,
+                            enum mod_log_op op, gfp_t flags)
+{
+       struct tree_mod_elem *entry;
+       int seq;
+
+       seq = tree_mod_alloc(fs_info, flags, &entry);
+       if (seq <= 0)
+               return seq;
+
+       entry->op = op;
+       entry->slot = slot;
+       entry->index = eb->start >> PAGE_CACHE_SHIFT;
+
+       if (op != MOD_LOG_KEY_ADD) {
+               btrfs_node_key(eb, &entry->key, slot);
+               entry->blockptr = btrfs_node_blockptr(eb, slot);
+       }
+       entry->generation = btrfs_node_ptr_generation(eb, slot);
+
+       return __tree_mod_log_insert(fs_info, entry);
+}
+
+/*
+ * Convenience wrapper around tree_mod_log_insert_key_mask() that always
+ * allocates the log entry with GFP_NOFS. Returns whatever the wrapped
+ * function returns.
+ */
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                       int slot, enum mod_log_op op)
+{
+       return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+/*
+ * Log a move of @nr_items key pointers inside @eb from @src_slot to
+ * @dst_slot.
+ *
+ * Destination slots that lie in front of @src_slot are first logged as
+ * MOD_LOG_KEY_REMOVE_WHILE_MOVING, then a single MOD_LOG_MOVE_KEYS entry is
+ * added. All entries are allocated with the caller's @flags.
+ *
+ * Returns 0 when nothing had to be logged, a negative errno from the
+ * allocation, or the result of __tree_mod_log_insert().
+ */
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *eb, int dst_slot, int src_slot,
+                        int nr_items, gfp_t flags)
+{
+       struct tree_mod_elem *tm;
+       int ret;
+       int i;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return 0;
+
+       for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+               /* honour @flags here too instead of a fixed GFP_NOFS */
+               ret = tree_mod_log_insert_key_mask(fs_info, eb, i + dst_slot,
+                                       MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+                                       flags);
+               BUG_ON(ret < 0);
+       }
+
+       ret = tree_mod_alloc(fs_info, flags, &tm);
+       if (ret <= 0)
+               return ret;
+
+       tm->index = eb->start >> PAGE_CACHE_SHIFT;
+       tm->slot = src_slot;
+       tm->move.dst_slot = dst_slot;
+       tm->move.nr_items = nr_items;
+       tm->op = MOD_LOG_MOVE_KEYS;
+
+       return __tree_mod_log_insert(fs_info, tm);
+}
+
+/*
+ * Log the replacement of @old_root by @new_root. The entry is indexed by the
+ * *new* root and remembers logical address, level and generation of the old
+ * one.
+ *
+ * Returns 0 when nothing had to be logged, a negative errno from the
+ * allocation, or the result of __tree_mod_log_insert().
+ */
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *old_root,
+                        struct extent_buffer *new_root, gfp_t flags)
+{
+       struct tree_mod_elem *entry;
+       int seq;
+
+       seq = tree_mod_alloc(fs_info, flags, &entry);
+       if (seq <= 0)
+               return seq;
+
+       entry->op = MOD_LOG_ROOT_REPLACE;
+       entry->index = new_root->start >> PAGE_CACHE_SHIFT;
+       entry->generation = btrfs_header_generation(old_root);
+       entry->old_root.level = btrfs_header_level(old_root);
+       entry->old_root.logical = old_root->start;
+
+       return __tree_mod_log_insert(fs_info, entry);
+}
+
+/*
+ * Look up a log entry for the block at logical address @start, ignoring
+ * entries whose sequence number is below @min_seq.
+ *
+ * With @smallest set, the entry with the smallest remaining sequence number
+ * is returned, otherwise the one with the largest. Returns NULL if the log
+ * holds no matching entry.
+ *
+ * The descent mirrors __tree_mod_log_insert(): larger index / sequence
+ * values are found in the left subtree.
+ *
+ * NOTE(review): the entry is returned after tree_mod_log_lock has been
+ * dropped; presumably the caller's sequence blocker keeps it from being
+ * freed - confirm against the callers.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+                     int smallest)
+{
+       struct rb_root *tm_root;
+       struct rb_node *node;
+       struct tree_mod_elem *cur = NULL;
+       struct tree_mod_elem *found = NULL;
+       u64 index = start >> PAGE_CACHE_SHIFT;
+
+       read_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       node = tm_root->rb_node;
+       while (node) {
+               cur = container_of(node, struct tree_mod_elem, node);
+               if (cur->index < index) {
+                       node = node->rb_left;
+               } else if (cur->index > index) {
+                       node = node->rb_right;
+               } else if (cur->elem.seq < min_seq) {
+                       /* too old, newer entries are to the left */
+                       node = node->rb_left;
+               } else if (!smallest) {
+                       /* we want the node with the highest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq > cur->elem.seq);
+                       found = cur;
+                       node = node->rb_left;
+               } else if (cur->elem.seq > min_seq) {
+                       /* we want the node with the smallest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq < cur->elem.seq);
+                       found = cur;
+                       node = node->rb_right;
+               } else {
+                       /* seq == min_seq: nothing smaller can qualify */
+                       found = cur;
+                       break;
+               }
+       }
+       read_unlock(&fs_info->tree_mod_log_lock);
+
+       return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log for the block at logical address start (the
+ * oldest log item). any element with a time sequence lower than min_seq
+ * will be ignored. returns NULL if no such element exists.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+                          u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log for the block at logical address start (the most
+ * recent log item). any element with a time sequence lower than min_seq
+ * will be ignored. returns NULL if no such element exists.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+/*
+ * Log a copy of @nr_items key pointers from @src (starting at slot
+ * @src_offset) to @dst (starting at slot @dst_offset): one
+ * MOD_LOG_KEY_REMOVE for each source slot and one MOD_LOG_KEY_ADD for each
+ * destination slot. Nothing is logged when logging is off or when both
+ * buffers are level 0 blocks. A failing log insertion is fatal (BUG_ON).
+ */
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    struct extent_buffer *src, unsigned long dst_offset,
+                    unsigned long src_offset, int nr_items)
+{
+       int err;
+       int item = 0;
+
+       if (tree_mod_dont_log(fs_info, NULL) ||
+           (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0))
+               return;
+
+       /* speed this up by single seq for all operations? */
+       while (item < nr_items) {
+               err = tree_mod_log_insert_key(fs_info, src, item + src_offset,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(err < 0);
+
+               err = tree_mod_log_insert_key(fs_info, dst, item + dst_offset,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(err < 0);
+
+               item++;
+       }
+}
+
+/*
+ * Log a move of @nr_items key pointers inside @dst from slot @src_offset to
+ * slot @dst_offset, allocating with GFP_NOFS. A failure to log is treated
+ * as fatal (BUG_ON).
+ */
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    int dst_offset, int src_offset, int nr_items)
+{
+       int ret;
+       ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+                                      nr_items, GFP_NOFS);
+       BUG_ON(ret < 0);
+}
+
+/*
+ * Log that the key pointer in @slot of @eb is about to be replaced
+ * (MOD_LOG_KEY_REPLACE). The log entry is allocated with GFP_ATOMIC when
+ * @atomic is non-zero and with GFP_NOFS otherwise. @disk_key is not used
+ * here. A failure to log is treated as fatal (BUG_ON).
+ */
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+                         struct extent_buffer *eb,
+                         struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+       gfp_t mask = GFP_NOFS;
+       int err;
+
+       if (atomic)
+               mask = GFP_ATOMIC;
+
+       err = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+                                          MOD_LOG_KEY_REPLACE, mask);
+       BUG_ON(err < 0);
+}
+
+/*
+ * Log the removal of every key pointer of @eb, walking from the last slot
+ * down to slot 0, as MOD_LOG_KEY_REMOVE_WHILE_FREEING. Does nothing when
+ * logging is off or @eb is a level 0 block. A failure to log is treated as
+ * fatal (BUG_ON).
+ */
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb)
+{
+       int slot;
+       int err;
+       u32 nritems;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return;
+
+       nritems = btrfs_header_nritems(eb);
+       slot = nritems - 1;
+       while (slot >= 0) {
+               err = tree_mod_log_insert_key(fs_info, eb, slot,
+                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+               BUG_ON(err < 0);
+               slot--;
+       }
+}
+
+/*
+ * Log that root->node is about to be replaced by @new_root_node: first the
+ * removal of all key pointers of the current root node, then the root
+ * replacement itself. A failure to log the replacement is fatal (BUG_ON).
+ */
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+                             struct extent_buffer *new_root_node)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       int err;
+
+       tree_mod_log_free_eb(fs_info, root->node);
+
+       err = tree_mod_log_insert_root(fs_info, root->node, new_root_node,
+                                      GFP_NOFS);
+       BUG_ON(err < 0);
+}
+
 /*
  * check if the tree block can be shared by multiple trees
  */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                        BUG_ON(ret); /* -ENOMEM */
                }
+               /*
+                * don't log freeing in case we're freeing the root node, this
+                * is done by tree_mod_log_set_root_pointer later
+                */
+               if (buf != root->node && btrfs_header_level(buf) != 0)
+                       tree_mod_log_free_eb(root->fs_info, buf);
                clean_tree_block(trans, root, buf);
                *last_ref = 1;
        }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
        cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                     root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size, 1);
+                                    level, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);
 
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                        parent_start = 0;
 
                extent_buffer_get(cow);
+               tree_mod_log_set_root_pointer(root, cow);
                rcu_assign_pointer(root->node, cow);
 
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                        parent_start = 0;
 
                WARN_ON(trans->transid != btrfs_header_generation(parent));
+               tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+                                       MOD_LOG_KEY_REPLACE);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        return 0;
 }
 
+/*
+ * returns the log entry of the oldest root replacement in the chain of
+ * predecessors of the given root; its old_root member describes the oldest
+ * predecessor. entries older than time_seq are ignored.
+ *
+ * returns NULL if time_seq is 0, if nothing is logged for the current root
+ * node, or if the oldest entry for the current root node is not a root
+ * replacement.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+                          struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct tree_mod_elem *found = NULL;
+       u64 root_logical = root->node->start;
+       int looped = 0;
+
+       if (!time_seq)
+               return NULL;
+
+       /*
+        * the very last operation that's logged for a root is the replacement
+        * operation (if it is replaced at all). this has the index of the *new*
+        * root, making it the very first operation that's logged for this root.
+        */
+       while (1) {
+               tm = tree_mod_log_search_oldest(fs_info, root_logical,
+                                               time_seq);
+               if (!looped && !tm)
+                       return NULL;
+               /*
+                * we must have key remove operations in the log before the
+                * replace operation.
+                */
+               BUG_ON(!tm);
+
+               if (tm->op != MOD_LOG_ROOT_REPLACE)
+                       break;
+
+               /* follow the chain to the root that was replaced */
+               found = tm;
+               root_logical = tm->old_root.logical;
+               BUG_ON(root_logical == root->node->start);
+               looped = 1;
+       }
+
+       return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ *
+ * eb is modified in place; its item count is tracked in n while rewinding
+ * and written back to the header at the end.
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+                     struct tree_mod_elem *first_tm)
+{
+       u32 n;
+       struct rb_node *next;
+       struct tree_mod_elem *tm = first_tm;
+       unsigned long o_dst;
+       unsigned long o_src;
+       unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+       n = btrfs_header_nritems(eb);
+       while (tm && tm->elem.seq >= time_seq) {
+               /*
+                * all the operations are recorded with the operator used for
+                * the modification. as we're going backwards, we do the
+                * opposite of each operation here.
+                */
+               switch (tm->op) {
+               case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+                       BUG_ON(tm->slot < n);
+                       /* fall through */
+               case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+               case MOD_LOG_KEY_REMOVE:
+                       /* put the saved key pointer back into its slot */
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       n++;
+                       break;
+               case MOD_LOG_KEY_REPLACE:
+                       BUG_ON(tm->slot >= n);
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       break;
+               case MOD_LOG_KEY_ADD:
+                       /*
+                        * NOTE(review): only a single key pointer is shifted
+                        * here. if the insert path already logs its own
+                        * MOD_LOG_MOVE_KEYS entry for making room, this move
+                        * may be redundant - confirm against the insert path.
+                        */
+                       if (tm->slot != n - 1) {
+                               o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                               o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
+                               memmove_extent_buffer(eb, o_dst, o_src, p_size);
+                       }
+                       n--;
+                       break;
+               case MOD_LOG_MOVE_KEYS:
+                       /* move the key pointers back from dst_slot to slot */
+                       o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                       o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+                       memmove_extent_buffer(eb, o_dst, o_src,
+                                             tm->move.nr_items * p_size);
+                       break;
+               case MOD_LOG_ROOT_REPLACE:
+                       /*
+                        * this operation is special. for roots, this must be
+                        * handled explicitly before rewinding.
+                        * for non-roots, this operation may exist if the node
+                        * was a root: root A -> child B; then A gets empty and
+                        * B is promoted to the new root. in the mod log, we'll
+                        * have a root-replace operation for B, a tree block
+                        * that is no root. we simply ignore that operation.
+                        */
+                       break;
+               }
+               /*
+                * step to the next entry in tree order and stop as soon as it
+                * belongs to a different block.
+                */
+               next = rb_next(&tm->node);
+               if (!next)
+                       break;
+               tm = container_of(next, struct tree_mod_elem, node);
+               if (tm->index != first_tm->index)
+                       break;
+       }
+       btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Return a version of @eb as it looked at @time_seq.
+ *
+ * @eb itself is returned when @time_seq is 0, when @eb is a level 0 block or
+ * when the log holds no entry for it. Otherwise a private buffer is built
+ * (a dummy buffer if the most recent entry shows the block was freed, a
+ * clone of @eb if not), an extra reference is taken on it, @eb is released
+ * with free_extent_buffer() and the logged operations are rewound on the
+ * private buffer. Allocation failures are fatal (BUG_ON).
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                   u64 time_seq)
+{
+       struct extent_buffer *rewound;
+       struct tree_mod_elem *newest;
+
+       if (!time_seq || btrfs_header_level(eb) == 0)
+               return eb;
+
+       newest = tree_mod_log_search(fs_info, eb->start, time_seq);
+       if (!newest)
+               return eb;
+
+       if (newest->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+               rewound = btrfs_clone_extent_buffer(eb);
+               BUG_ON(!rewound);
+       } else {
+               BUG_ON(newest->slot != 0);
+               rewound = alloc_dummy_extent_buffer(eb->start,
+                                               fs_info->tree_root->nodesize);
+               BUG_ON(!rewound);
+               /* carry the identifying header fields over from eb */
+               btrfs_set_header_bytenr(rewound, eb->start);
+               btrfs_set_header_backref_rev(rewound,
+                                            btrfs_header_backref_rev(eb));
+               btrfs_set_header_owner(rewound, btrfs_header_owner(eb));
+               btrfs_set_header_level(rewound, btrfs_header_level(eb));
+       }
+
+       extent_buffer_get(rewound);
+       free_extent_buffer(eb);
+
+       __tree_mod_log_rewind(rewound, time_seq, newest);
+
+       return rewound;
+}
+
+/*
+ * Return the root node of @root as it looked at @time_seq.
+ *
+ * If the log holds no root replacement to rewind, root->node itself is
+ * returned. Otherwise a private buffer is built (a clone of the current
+ * root node, or a dummy buffer for a root that has been replaced), stamped
+ * with the logged level and generation, and rewound.
+ *
+ * Returns NULL if the private buffer cannot be allocated.
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct extent_buffer *eb;
+       struct tree_mod_root *old_root;
+       u64 old_generation;
+
+       tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+       if (!tm)
+               return root->node;
+
+       old_root = &tm->old_root;
+       old_generation = tm->generation;
+
+       tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
+       /*
+        * there was an item in the log when __tree_mod_log_oldest_root
+        * returned. this one must not go away, because the time_seq passed to
+        * us must be blocking its removal.
+        */
+       BUG_ON(!tm);
+
+       if (old_root->logical == root->node->start) {
+               /* there are logged operations for the current root */
+               eb = btrfs_clone_extent_buffer(root->node);
+               if (!eb)
+                       return NULL;
+       } else {
+               /* there's a root replace operation for the current root */
+               eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
+                                              root->nodesize);
+               /* check before touching the header, eb is dereferenced below */
+               if (!eb)
+                       return NULL;
+               btrfs_set_header_bytenr(eb, eb->start);
+               btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+               btrfs_set_header_owner(eb, root->root_key.objectid);
+       }
+       btrfs_set_header_level(eb, old_root->level);
+       btrfs_set_header_generation(eb, old_generation);
+       __tree_mod_log_rewind(eb, time_seq, tm);
+
+       return eb;
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root,
                                   struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                                if (!cur)
                                        return -EIO;
                        } else if (!uptodate) {
-                               btrfs_read_buffer(cur, gen);
+                               err = btrfs_read_buffer(cur, gen);
+                               if (err) {
+                                       free_extent_buffer(cur);
+                                       return err;
+                               }
                        }
                }
                if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                      int level, int *slot)
 {
-       if (level == 0) {
+       if (level == 0)
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_leaf, items),
                                          sizeof(struct btrfs_item),
                                          key, btrfs_header_nritems(eb),
                                          slot);
-       } else {
+       else
                return generic_bin_search(eb,
                                          offsetof(struct btrfs_node, ptrs),
                                          sizeof(struct btrfs_key_ptr),
                                          key, btrfs_header_nritems(eb),
                                          slot);
-       }
-       return -1;
 }
 
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        goto enospc;
                }
 
+               tree_mod_log_set_root_pointer(root, child);
                rcu_assign_pointer(root->node, child);
 
                add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                free_extent_buffer(mid);
 
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer_stale(mid);
                return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (btrfs_header_nritems(right) == 0) {
                        clean_tree_block(trans, root, right);
                        btrfs_tree_unlock(right);
-                       del_ptr(trans, root, path, level + 1, pslot + 1);
+                       del_ptr(trans, root, path, level + 1, pslot + 1, 1);
                        root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+                       btrfs_free_tree_block(trans, root, right, 0, 1);
                        free_extent_buffer_stale(right);
                        right = NULL;
                } else {
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &right_key, pslot + 1, 0);
                        btrfs_set_node_key(parent, &right_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
                }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        if (btrfs_header_nritems(mid) == 0) {
                clean_tree_block(trans, root, mid);
                btrfs_tree_unlock(mid);
-               del_ptr(trans, root, path, level + 1, pslot);
+               del_ptr(trans, root, path, level + 1, pslot, 1);
                root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                free_extent_buffer_stale(mid);
                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
                btrfs_node_key(mid, &mid_key, 0);
+               tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+                                         pslot, 0);
                btrfs_set_node_key(parent, &mid_key, pslot);
                btrfs_mark_buffer_dirty(parent);
        }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
                        orig_slot += left_nr;
                        btrfs_node_key(mid, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot);
                        btrfs_mark_buffer_dirty(parent);
                        if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        struct btrfs_disk_key disk_key;
 
                        btrfs_node_key(right, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot + 1, 0);
                        btrfs_set_node_key(parent, &disk_key, pslot + 1);
                        btrfs_mark_buffer_dirty(parent);
 
@@ -1496,7 +2158,7 @@ static int
 read_block_for_search(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *p,
                       struct extent_buffer **eb_ret, int level, int slot,
-                      struct btrfs_key *key)
+                      struct btrfs_key *key, u64 time_seq)
 {
        u64 blocknr;
        u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
                        }
 
                        err = read_block_for_search(trans, root, p,
-                                                   &b, level, slot, key);
+                                                   &b, level, slot, key, 0);
                        if (err == -EAGAIN)
                                goto again;
                        if (err) {
@@ -1921,6 +2583,115 @@ done:
        return ret;
 }
 
+/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter. Insert, delete and cow are not supported.
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ *
+ * NOTE(review): get_old_root() has a "return NULL" path, yet its result is
+ * handed to extent_buffer_get() below unchecked -- confirm and add a check.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq)
+{
+       struct extent_buffer *b;
+       int slot;
+       int ret;
+       int err;
+       int level;
+       int lowest_unlock = 1;
+       u8 lowest_level = 0;
+
+       lowest_level = p->lowest_level;
+       WARN_ON(p->nodes[0] != NULL);
+
+       if (p->search_commit_root) {
+               BUG_ON(time_seq);
+               return btrfs_search_slot(NULL, root, key, p, 0, 0);
+       }
+
+again:
+       b = get_old_root(root, time_seq);
+       extent_buffer_get(b);
+       level = btrfs_header_level(b);
+       btrfs_tree_read_lock(b);
+       p->locks[level] = BTRFS_READ_LOCK;
+
+       while (b) {
+               level = btrfs_header_level(b);
+               p->nodes[level] = b;
+               btrfs_clear_path_blocking(p, NULL, 0);
+
+               /*
+                * we have a lock on b and as long as we aren't changing
+                * the tree, there is no way for the items in b to change.
+                * It is safe to drop the lock on our parent before we
+                * go through the expensive btree search on b.
+                */
+               btrfs_unlock_up_safe(p, level + 1);
+
+               ret = bin_search(b, key, level, &slot);
+
+               if (level != 0) {
+                       int dec = 0;
+                       if (ret && slot > 0) {
+                               dec = 1;
+                               slot -= 1;
+                       }
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+
+                       if (level == lowest_level) {
+                               if (dec)
+                                       p->slots[level]++;
+                               goto done;
+                       }
+
+                       err = read_block_for_search(NULL, root, p, &b, level,
+                                                   slot, key, time_seq);
+                       if (err == -EAGAIN)
+                               goto again;
+                       if (err) {
+                               ret = err;
+                               goto done;
+                       }
+
+                       level = btrfs_header_level(b);
+                       err = btrfs_try_tree_read_lock(b);
+                       if (!err) {
+                               btrfs_set_path_blocking(p);
+                               btrfs_tree_read_lock(b);
+                               btrfs_clear_path_blocking(p, b,
+                                                         BTRFS_READ_LOCK);
+                       }
+                       p->locks[level] = BTRFS_READ_LOCK;
+                       p->nodes[level] = b;
+                       b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+                       if (b != p->nodes[level]) {
+                               btrfs_tree_unlock_rw(p->nodes[level],
+                                                    p->locks[level]);
+                               p->locks[level] = 0;
+                               p->nodes[level] = b;
+                       }
+               } else {
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+                       goto done;
+               }
+       }
+       ret = 1;
+done:
+       if (!p->leave_spinning)
+               btrfs_set_path_blocking(p);
+       if (ret < 0)
+               btrfs_release_path(p);
+
+       return ret;
+}
+
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
                if (!path->nodes[i])
                        break;
                t = path->nodes[i];
+               tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
                btrfs_set_node_key(t, key, tslot);
                btrfs_mark_buffer_dirty(path->nodes[i]);
                if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
        } else
                push_items = min(src_nritems - 8, push_items);
 
+       tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+                            push_items);
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(dst_nritems),
                           btrfs_node_key_ptr_offset(0),
                           push_items * sizeof(struct btrfs_key_ptr));
 
        if (push_items < src_nritems) {
+               tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+                                    src_nritems - push_items);
                memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                      btrfs_node_key_ptr_offset(push_items),
                                      (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
        if (max_push < push_items)
                push_items = max_push;
 
+       tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
        memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                      btrfs_node_key_ptr_offset(0),
                                      (dst_nritems) *
                                      sizeof(struct btrfs_key_ptr));
 
+       tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+                            src_nritems - push_items, push_items);
        copy_extent_buffer(dst, src,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 
        c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                   root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0, 0);
+                                  level, root->node->start, 0);
        if (IS_ERR(c))
                return PTR_ERR(c);
 
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(c);
 
        old = root->node;
+       tree_mod_log_set_root_pointer(root, c);
        rcu_assign_pointer(root->node, c);
 
        /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 static void insert_ptr(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct btrfs_path *path,
                       struct btrfs_disk_key *key, u64 bytenr,
-                      int slot, int level)
+                      int slot, int level, int tree_mod_log)
 {
        struct extent_buffer *lower;
        int nritems;
+       int ret;
 
        BUG_ON(!path->nodes[level]);
        btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
        BUG_ON(slot > nritems);
        BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
        if (slot != nritems) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+                                            slot, nritems - slot);
                memmove_extent_buffer(lower,
                              btrfs_node_key_ptr_offset(slot + 1),
                              btrfs_node_key_ptr_offset(slot),
                              (nritems - slot) * sizeof(struct btrfs_key_ptr));
        }
+       if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(ret < 0);
+       }
        btrfs_set_node_key(lower, key, slot);
        btrfs_set_node_blockptr(lower, slot, bytenr);
        WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 
        split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, level, c->start, 0, 0);
+                                       &disk_key, level, c->start, 0);
        if (IS_ERR(split))
                return PTR_ERR(split);
 
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                            (unsigned long)btrfs_header_chunk_tree_uuid(split),
                            BTRFS_UUID_SIZE);
 
-
+       tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
        copy_extent_buffer(split, c,
                           btrfs_node_key_ptr_offset(0),
                           btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(split);
 
        insert_ptr(trans, root, path, &disk_key, split->start,
-                  path->slots[level + 1] + 1, level + 1);
+                  path->slots[level + 1] + 1, level + 1, 1);
 
        if (path->slots[level] >= mid) {
                path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
        btrfs_set_header_nritems(l, mid);
        btrfs_item_key(right, &disk_key, 0);
        insert_ptr(trans, root, path, &disk_key, right->start,
-                  path->slots[1] + 1, 1);
+                  path->slots[1] + 1, 1, 0);
 
        btrfs_mark_buffer_dirty(right);
        btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
 
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0, 0);
+                                       &disk_key, 0, l->start, 0);
        if (IS_ERR(right))
                return PTR_ERR(right);
 
@@ -3028,7 +3817,7 @@ again:
                if (mid <= slot) {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
-                                  path->slots[1] + 1, 1);
+                                  path->slots[1] + 1, 1, 0);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
                } else {
                        btrfs_set_header_nritems(right, 0);
                        insert_ptr(trans, root, path, &disk_key, right->start,
-                                         path->slots[1], 1);
+                                         path->slots[1], 1, 0);
                        btrfs_tree_unlock(path->nodes[0]);
                        free_extent_buffer(path->nodes[0]);
                        path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
  * empty a node.
  */
 static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                   struct btrfs_path *path, int level, int slot)
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log)
 {
        struct extent_buffer *parent = path->nodes[level];
        u32 nritems;
+       int ret;
 
        nritems = btrfs_header_nritems(parent);
        if (slot != nritems - 1) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, parent, slot,
+                                            slot + 1, nritems - slot - 1);
                memmove_extent_buffer(parent,
                              btrfs_node_key_ptr_offset(slot),
                              btrfs_node_key_ptr_offset(slot + 1),
                              sizeof(struct btrfs_key_ptr) *
                              (nritems - slot - 1));
+       } else if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(ret < 0);
        }
+
        nritems--;
        btrfs_set_header_nritems(parent, nritems);
        if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                    struct extent_buffer *leaf)
 {
        WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-       del_ptr(trans, root, path, 1, path->slots[1]);
+       del_ptr(trans, root, path, 1, path->slots[1], 1);
 
        /*
         * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
        root_sub_used(root, leaf->len);
 
        extent_buffer_get(leaf);
-       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1);
        free_extent_buffer_stale(leaf);
 }
 /*
@@ -4271,7 +5070,7 @@ again:
                next = c;
                next_rw_lock = path->locks[level];
                ret = read_block_for_search(NULL, root, path, &next, level,
-                                           slot, &key);
+                                           slot, &key, 0);
                if (ret == -EAGAIN)
                        goto again;
 
@@ -4308,7 +5107,7 @@ again:
                        break;
 
                ret = read_block_for_search(NULL, root, path, &next, level,
-                                           0, &key);
+                                           0, &key, 0);
                if (ret == -EAGAIN)
                        goto again;
 
index 8fd72331d6008c100e48db1c808566eb382187b2..0236d03c6732569a48a561049ea5a861d473da65 100644 (file)
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 #define BTRFS_FT_XATTR         8
 #define BTRFS_FT_MAX           9
 
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
        u8 csum;
 } __attribute__ ((__packed__));
 
+struct btrfs_dev_stats_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
 #define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
 #define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
        spinlock_t delayed_iput_lock;
        struct list_head delayed_iputs;
 
+       /* this protects tree_mod_seq_list */
+       spinlock_t tree_mod_seq_lock;
+       atomic_t tree_mod_seq;
+       struct list_head tree_mod_seq_list;
+
+       /* this protects tree_mod_log */
+       rwlock_t tree_mod_log_lock;
+       struct rb_root tree_mod_log;
+
        atomic_t nr_async_submits;
        atomic_t async_submit_draining;
        atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
        struct list_head root_list;
 
        spinlock_t orphan_lock;
-       struct list_head orphan_list;
+       atomic_t orphan_inodes;
        struct btrfs_block_rsv *orphan_block_rsv;
        int orphan_item_inserted;
        int orphan_cleanup_state;
@@ -1507,6 +1527,12 @@ struct btrfs_ioctl_defrag_range_args {
 
 #define BTRFS_BALANCE_ITEM_KEY 248
 
+/*
+ * Persistently stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY    249
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
        return btrfs_item_size(eb, e) - offset;
 }
 
+/* btrfs_dev_stats_item accessors (ptr is an offset, never dereferenced) */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+                                       struct btrfs_dev_stats_item *ptr,
+                                       int index)
+{
+       u64 val;
+
+       read_extent_buffer(eb, &val,
+                          offsetof(struct btrfs_dev_stats_item, values) +
+                           ((unsigned long)ptr) + (index * sizeof(u64)),
+                          sizeof(val));
+       return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+                                            struct btrfs_dev_stats_item *ptr,
+                                            int index, u64 val)
+{
+       write_extent_buffer(eb, &val,
+                           offsetof(struct btrfs_dev_stats_item, values) +
+                            ((unsigned long)ptr) + (index * sizeof(u64)),
+                           sizeof(val));
+}
+
 static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow);
+                                       u64 hint, u64 empty_size);
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow);
+                          u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_key *key, struct btrfs_path *p, int
                      ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct extent_buffer *parent,
                       int start_slot, int cache_only, u64 *last_ret,
@@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
 int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle);
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                         u64 start, int err);
 
+/* delayed seq elem */
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+       u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+
+static inline int is_fstree(u64 rootid)
+{
+       if (rootid == BTRFS_FS_TREE_OBJECTID ||
+           (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+               return 1;
+       return 0;
+}
 #endif
index 03e3748d84d02407c19c6d46648667a56f13ba3e..c18d0442ae6daa69a564ebba400f9ad09573ea1d 100644 (file)
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
                return ret;
        } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                spin_lock(&BTRFS_I(inode)->lock);
-               if (BTRFS_I(inode)->delalloc_meta_reserved) {
-                       BTRFS_I(inode)->delalloc_meta_reserved = 0;
+               if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                                      &BTRFS_I(inode)->runtime_flags)) {
                        spin_unlock(&BTRFS_I(inode)->lock);
                        release = true;
                        goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
        btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
        btrfs_set_stack_inode_generation(inode_item,
                                         BTRFS_I(inode)->generation);
-       btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+       btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
        btrfs_set_stack_inode_transid(inode_item, trans->transid);
        btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
        btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
        set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
        inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
        BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
-       BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+       inode->i_version = btrfs_stack_inode_sequence(inode_item);
        inode->i_rdev = 0;
        *rdev = btrfs_stack_inode_rdev(inode_item);
        BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
index 69f22e3ab3bc307974b5cae14f99310a498b54cf..13ae7b04790eaff72e8c23fb145fca8bfae88175 100644 (file)
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                seq = inc_delayed_seq(delayed_refs);
        ref->seq = seq;
 
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        ref->is_head = 0;
        ref->in_tree = 1;
 
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                seq = inc_delayed_seq(delayed_refs);
        ref->seq = seq;
 
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
        add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, level, action,
                                   for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
            waitqueue_active(&delayed_refs->seq_wait))
                wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
+
        return 0;
 }
 
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
        add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
                                   num_bytes, parent, ref_root, owner, offset,
                                   action, for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
            waitqueue_active(&delayed_refs->seq_wait))
                wake_up(&delayed_refs->seq_wait);
        spin_unlock(&delayed_refs->lock);
+
        return 0;
 }
 
index d8f244d9492511e3b108b26bcf4da1bc9fbf6826..413927fb9957e41fdcfb82511e63d416b8a36c76 100644 (file)
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                           struct list_head *cluster, u64 search_start);
 
-struct seq_list {
-       struct list_head list;
-       u64 seq;
-};
-
 static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
 {
        assert_spin_locked(&delayed_refs->lock);
@@ -229,25 +224,6 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
 int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
                            u64 seq);
 
-/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-       if (for_cow)
-               return 0;
-
-       if (rootid == BTRFS_FS_TREE_OBJECTID)
-               return 1;
-
-       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-               return 1;
-
-       return 0;
-}
-
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
index e1fe74a2ce16e6a4e0b38129160f484e642c42fa..7ae51decf6d3d0fb5c3d44bb7791f843c74aa376 100644 (file)
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->orphan_block_rsv = NULL;
 
        INIT_LIST_HEAD(&root->dirty_list);
-       INIT_LIST_HEAD(&root->orphan_list);
        INIT_LIST_HEAD(&root->root_list);
        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        atomic_set(&root->log_commit[0], 0);
        atomic_set(&root->log_commit[1], 0);
        atomic_set(&root->log_writers, 0);
+       atomic_set(&root->orphan_inodes, 0);
        root->log_batch = 0;
        root->log_transid = 0;
        root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                      BTRFS_TREE_LOG_OBJECTID, NULL,
-                                     0, 0, 0, 0);
+                                     0, 0, 0);
        if (IS_ERR(leaf)) {
                kfree(root);
                return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
        spin_lock_init(&fs_info->delayed_iput_lock);
        spin_lock_init(&fs_info->defrag_inodes_lock);
        spin_lock_init(&fs_info->free_chunk_lock);
+       spin_lock_init(&fs_info->tree_mod_seq_lock);
+       rwlock_init(&fs_info->tree_mod_log_lock);
        mutex_init(&fs_info->reloc_mutex);
 
        init_completion(&fs_info->kobj_unregister);
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
+       INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
        btrfs_mapping_init(&fs_info->mapping_tree);
        btrfs_init_block_rsv(&fs_info->global_block_rsv);
        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
        atomic_set(&fs_info->async_submit_draining, 0);
        atomic_set(&fs_info->nr_async_bios, 0);
        atomic_set(&fs_info->defrag_running, 0);
+       atomic_set(&fs_info->tree_mod_seq, 0);
        fs_info->sb = sb;
        fs_info->max_inline = 8192 * 1024;
        fs_info->metadata_ratio = 0;
        fs_info->defrag_inodes = RB_ROOT;
        fs_info->trans_no_join = 0;
        fs_info->free_chunk_space = 0;
+       fs_info->tree_mod_log = RB_ROOT;
 
        /* readahead state */
        INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
        BTRFS_I(fs_info->btree_inode)->root = tree_root;
        memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
               sizeof(struct btrfs_key));
-       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY,
+               &BTRFS_I(fs_info->btree_inode)->runtime_flags);
        insert_inode_hash(fs_info->btree_inode);
 
        spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
        fs_info->generation = generation;
        fs_info->last_trans_committed = generation;
 
+       ret = btrfs_init_dev_stats(fs_info);
+       if (ret) {
+               printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+                      ret);
+               goto fail_block_groups;
+       }
+
        ret = btrfs_init_space_info(fs_info);
        if (ret) {
                printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-       char b[BDEVNAME_SIZE];
-
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
+               struct btrfs_device *device = (struct btrfs_device *)
+                       bh->b_private;
+
                printk_ratelimited(KERN_WARNING "lost page write due to "
-                                       "I/O error on %s\n",
-                                      bdevname(bh->b_bdev, b));
+                                  "I/O error on %s\n", device->name);
                /* note, we dont' set_buffer_write_io_error because we have
                 * our own ways of dealing with the IO errors
                 */
                clear_buffer_uptodate(bh);
+               btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
        }
        unlock_buffer(bh);
        put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
                        set_buffer_uptodate(bh);
                        lock_buffer(bh);
                        bh->b_end_io = btrfs_end_buffer_write_sync;
+                       bh->b_private = device;
                }
 
                /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
                }
                if (!bio_flagged(bio, BIO_UPTODATE)) {
                        ret = -EIO;
+                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+                               btrfs_dev_stat_inc_and_print(device,
+                                       BTRFS_DEV_STAT_FLUSH_ERRS);
                }
 
                /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
        return ret;
 }
 
-/* Kill all outstanding I/O */
-void btrfs_abort_devices(struct btrfs_root *root)
-{
-       struct list_head *head;
-       struct btrfs_device *dev;
-       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       head = &root->fs_info->fs_devices->devices;
-       list_for_each_entry_rcu(dev, head, dev_list) {
-               blk_abort_queue(dev->bdev->bd_disk->queue);
-       }
-       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-}
-
 void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
        spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
        return 0;
 }
 
-static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
-                                         u64 start, u64 end,
-                                         struct extent_state *state)
-{
-       struct super_block *sb = page->mapping->host->i_sb;
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       btrfs_error(fs_info, -EIO,
-                   "Error occured while writing out btree at %llu", start);
-       return -EIO;
-}
-
 static struct extent_io_ops btree_extent_io_ops = {
        .write_cache_pages_lock_hook = btree_lock_page_hook,
        .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
        .submit_bio_hook = btree_submit_bio_hook,
        /* note we're sharing with inode.c for the merge bio hook */
        .merge_bio_hook = btrfs_merge_bio_hook,
-       .writepage_io_failed_hook = btree_writepage_io_failed_hook,
 };
index ab1830aaf0edbffba6a0cef86d13e9b3f2742cda..05b3fab39f7e814fc8c958e125f5a14c7e39d7f9 100644 (file)
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 int btrfs_cleanup_transaction(struct btrfs_root *root);
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
                                  struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
index e887ee62b6d4ba0a98f7e2437323eecfca88bf23..614f34a899c2db468792f1ef8406c5a366739258 100644 (file)
                                             parent_root_objectid) / 4)
 #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
 
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
-                          int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+                          struct inode *parent)
 {
        struct btrfs_fid *fid = (struct btrfs_fid *)fh;
-       struct inode *inode = dentry->d_inode;
        int len = *max_len;
        int type;
 
-       if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+       if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
                *max_len = BTRFS_FID_SIZE_CONNECTABLE;
                return 255;
        } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        fid->root_objectid = BTRFS_I(inode)->root->objectid;
        fid->gen = inode->i_generation;
 
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
+       if (parent) {
                u64 parent_root_id;
 
-               spin_lock(&dentry->d_lock);
-
-               parent = dentry->d_parent->d_inode;
                fid->parent_objectid = BTRFS_I(parent)->location.objectid;
                fid->parent_gen = parent->i_generation;
                parent_root_id = BTRFS_I(parent)->root->objectid;
 
-               spin_unlock(&dentry->d_lock);
-
                if (parent_root_id != fid->root_objectid) {
                        fid->parent_root_objectid = parent_root_id;
                        len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
index 49fd7b66d57b272c7aeaea7db4b1bbd0985f8aa2..4b5a1e1bdefbe095c239b464e55c9699e865175b 100644 (file)
@@ -3578,7 +3578,7 @@ again:
        space_info->chunk_alloc = 0;
        spin_unlock(&space_info->lock);
 out:
-       mutex_unlock(&extent_root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
        return ret;
 }
 
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
        BTRFS_I(inode)->outstanding_extents--;
 
        if (BTRFS_I(inode)->outstanding_extents == 0 &&
-           BTRFS_I(inode)->delalloc_meta_reserved) {
+           test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                drop_inode_space = 1;
-               BTRFS_I(inode)->delalloc_meta_reserved = 0;
-       }
 
        /*
         * If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
         * Add an item to reserve for updating the inode when we complete the
         * delalloc io.
         */
-       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+       if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                     &BTRFS_I(inode)->runtime_flags)) {
                nr_extents++;
                extra_reserve = 1;
        }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
        spin_lock(&BTRFS_I(inode)->lock);
        if (extra_reserve) {
-               BTRFS_I(inode)->delalloc_meta_reserved = 1;
+               set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                       &BTRFS_I(inode)->runtime_flags);
                nr_extents--;
        }
        BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow)
+                          u64 parent, int last_ref)
 {
        struct btrfs_block_group_cache *cache = NULL;
        int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                        buf->start, buf->len,
                                        parent, root->root_key.objectid,
                                        btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
 
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        struct btrfs_root *root, u32 blocksize,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow)
+                                       u64 hint, u64 empty_size)
 {
        struct btrfs_key ins;
        struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        ins.objectid,
                                        ins.offset, parent, root_objectid,
                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op, for_cow);
+                                       extent_op, 0);
                BUG_ON(ret); /* -ENOMEM */
        }
        return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                               btrfs_header_owner(path->nodes[level + 1]));
        }
 
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
 out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
index c9018a05036e943a52ad91d81019bb4b934b6b9a..2c8f7b2046173954f720125a6e53e96de3c7727e 100644 (file)
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
                        return parent;
        }
 
-       entry = rb_entry(node, struct tree_entry, rb_node);
        rb_link_node(node, parent, p);
        rb_insert_color(node, root);
        return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
 
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1)
+ * it will optionally wake up any one waiting on this state (wake == 1).
  *
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                       clear_state_bit(tree, state, &bits, wake);
-                       if (last_end == (u64)-1)
-                               goto out;
-                       start = last_end + 1;
+                       state = clear_state_bit(tree, state, &bits, wake);
+                       goto next;
                }
                goto search_again;
        }
@@ -781,7 +778,6 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
                if (state->state & exclusive_bits) {
                        *failed_start = state->start;
                        err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
                }
 
                set_state_bits(tree, state, &bits);
-
                cache_state(state, cached_state);
                merge_state(tree, state);
                if (last_end == (u64)-1)
                        goto out;
-
                start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               state = next_state(state);
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                goto search_again;
        }
 
@@ -845,6 +836,10 @@ hit_next:
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
+                       state = next_state(state);
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                }
                goto search_again;
        }
@@ -994,21 +989,14 @@ hit_next:
         * Just lock what we found and keep going
         */
        if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
-
                set_state_bits(tree, state, &bits);
-               clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0);
                if (last_end == (u64)-1)
                        goto out;
-
                start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                goto search_again;
        }
 
@@ -1042,10 +1030,13 @@ hit_next:
                        goto out;
                if (state->end <= end) {
                        set_state_bits(tree, state, &bits);
-                       clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                }
                goto search_again;
        }
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                              cached_state, mask);
 }
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-                                u64 end, struct extent_state **cached_state,
-                                gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask)
 {
        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
                                cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
  * returned if we find something, and *start_ret and *end_ret are
  * set to reflect the state struct that was found.
  *
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned. If found something, return 0.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                          u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                /* try to remap that extent elsewhere? */
                bio_put(bio);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                return -EIO;
        }
 
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
                        uptodate = 0;
        }
 
-       if (!uptodate && tree->ops &&
-           tree->ops->writepage_io_failed_hook) {
-               ret = tree->ops->writepage_io_failed_hook(NULL, page,
-                                                start, end, NULL);
-               /* Writeback already completed */
-               if (ret == 0)
-                       return 1;
-       }
-
        if (!uptodate) {
-               clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
                ClearPageUptodate(page);
                SetPageError(page);
        }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                        ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                              state, mirror);
-                       if (ret)
+                       if (ret) {
+                               /* no IO indicated but software detected errors
+                                * in the block, either checksum errors or
+                                * issues with the contents */
+                               struct btrfs_root *root =
+                                       BTRFS_I(page->mapping->host)->root;
+                               struct btrfs_device *device;
+
                                uptodate = 0;
-                       else
+                               device = btrfs_find_device_for_logical(
+                                               root, start, mirror);
+                               if (device)
+                                       btrfs_dev_stat_inc_and_print(device,
+                                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
+                       } else {
                                clean_io_failure(start, page);
+                       }
                }
 
                if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
        u64 offset = eb->start;
        unsigned long i, num_pages;
        int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
-       int ret;
+       int ret = 0;
 
        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        eb->start = start;
        eb->len = len;
        eb->tree = tree;
+       eb->bflags = 0;
        rwlock_init(&eb->lock);
        atomic_set(&eb->write_locks, 0);
        atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
        return eb;
 }
 
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+       unsigned long i;
+       struct page *p;
+       struct extent_buffer *new;
+       unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+       new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+       if (new == NULL)
+               return NULL;    /* only failure reported to the caller */
+
+       for (i = 0; i < num_pages; i++) {
+               p = alloc_page(GFP_ATOMIC);
+               BUG_ON(!p);     /* NOTE(review): page alloc failure is fatal */
+               attach_extent_buffer_page(new, p);
+               WARN_ON(PageDirty(p));
+               SetPageUptodate(p);
+               new->pages[i] = p;
+       }
+
+       copy_extent_buffer(new, src, 0, 0, src->len);
+       set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+       set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);     /* clone is a dummy eb */
+
+       return new;
+}
+
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+       struct extent_buffer *eb;
+       unsigned long num_pages = num_extent_pages(0, len);
+       unsigned long i;
+
+       eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+       if (!eb)
+               return NULL;
+
+       for (i = 0; i < num_pages; i++) {
+               eb->pages[i] = alloc_page(GFP_ATOMIC);
+               if (!eb->pages[i])
+                       goto err;       /* pages[0..i-1] are allocated */
+       }
+       set_extent_buffer_uptodate(eb);
+       btrfs_set_header_nritems(eb, 0);
+       set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+       return eb;
+err:
+       while (i-- > 0)         /* frees pages[0] too; no wrap if i == 0 */
+               __free_page(eb->pages[i]);
+       __free_extent_buffer(eb);
+       return NULL;
+}
+
 static int extent_buffer_under_io(struct extent_buffer *eb)
 {
        return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                                                unsigned long start_idx)
 {
        unsigned long index;
+       unsigned long num_pages;
        struct page *page;
+       int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
 
        BUG_ON(extent_buffer_under_io(eb));
 
-       index = num_extent_pages(eb->start, eb->len);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       index = start_idx + num_pages;
        if (start_idx >= index)
                return;
 
        do {
                index--;
                page = extent_buffer_page(eb, index);
-               if (page) {
+               if (page && mapped) {
                        spin_lock(&page->mapping->private_lock);
                        /*
                         * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                        }
                        spin_unlock(&page->mapping->private_lock);
 
+               }
+               if (page) {
                        /* One for when we alloced the page */
                        page_cache_release(page);
                }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
 {
        WARN_ON(atomic_read(&eb->refs) == 0);
        if (atomic_dec_and_test(&eb->refs)) {
-               struct extent_io_tree *tree = eb->tree;
+               if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+                       spin_unlock(&eb->refs_lock);
+               } else {
+                       struct extent_io_tree *tree = eb->tree;
 
-               spin_unlock(&eb->refs_lock);
+                       spin_unlock(&eb->refs_lock);
 
-               spin_lock(&tree->buffer_lock);
-               radix_tree_delete(&tree->buffer,
-                                 eb->start >> PAGE_CACHE_SHIFT);
-               spin_unlock(&tree->buffer_lock);
+                       spin_lock(&tree->buffer_lock);
+                       radix_tree_delete(&tree->buffer,
+                                         eb->start >> PAGE_CACHE_SHIFT);
+                       spin_unlock(&tree->buffer_lock);
+               }
 
                /* Should be safe to release our pages at this point */
                btrfs_release_extent_buffer_page(eb, 0);
@@ -4259,6 +4317,10 @@ void free_extent_buffer(struct extent_buffer *eb)
                return;
 
        spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+               atomic_dec(&eb->refs);
+
        if (atomic_read(&eb->refs) == 2 &&
            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
            !extent_buffer_under_io(eb) &&
index b516c3b8dec68d825e380a1930976f34c8a3e1a4..25900af5b15d43e6bdfe0cef7c865ac2aa81bd36 100644 (file)
@@ -39,6 +39,7 @@
 #define EXTENT_BUFFER_STALE 6
 #define EXTENT_BUFFER_WRITEBACK 7
 #define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
                              unsigned long bio_flags);
        int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
        int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-       int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
-                                       u64 start, u64 end,
-                                      struct extent_state *state);
        int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
                                    struct extent_state *state, int mirror);
        int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                         u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
index 53bf2d764bbc4f5814db04710d3123d03c3779ba..70dc8ca73e257bc3a1e7a96ea48009bff093af9a 100644 (file)
@@ -65,6 +65,21 @@ struct inode_defrag {
        int cycled;
 };
 
+/*
+ * Three-way ordering of two defrag records: compare the root first and,
+ * when the roots match, the inode number.  Returns 1, -1 or 0.
+ */
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+                                 struct inode_defrag *defrag2)
+{
+       if (defrag1->root != defrag2->root)
+               return defrag1->root > defrag2->root ? 1 : -1;
+       if (defrag1->ino != defrag2->ino)
+               return defrag1->ino > defrag2->ino ? 1 : -1;
+
+       return 0;
+}
+
 /* pop a record for an inode into the defrag tree.  The lock
  * must be held already
  *
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
        struct inode_defrag *entry;
        struct rb_node **p;
        struct rb_node *parent = NULL;
+       int ret;
 
        p = &root->fs_info->defrag_inodes.rb_node;
        while (*p) {
                parent = *p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-               if (defrag->ino < entry->ino)
+               ret = __compare_inode_defrag(defrag, entry);
+               if (ret < 0)
                        p = &parent->rb_left;
-               else if (defrag->ino > entry->ino)
+               else if (ret > 0)
                        p = &parent->rb_right;
                else {
                        /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
                        goto exists;
                }
        }
-       BTRFS_I(inode)->in_defrag = 1;
+       set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
        rb_link_node(&defrag->rb_node, parent, p);
        rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
        return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        if (btrfs_fs_closing(root->fs_info))
                return 0;
 
-       if (BTRFS_I(inode)->in_defrag)
+       if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
                return 0;
 
        if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
        defrag->root = root->root_key.objectid;
 
        spin_lock(&root->fs_info->defrag_inodes_lock);
-       if (!BTRFS_I(inode)->in_defrag)
+       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
                __btrfs_add_inode_defrag(inode, defrag);
        else
                kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 /*
  * must be called with the defrag_inodes lock held
  */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
+                                            u64 root, u64 ino,
                                             struct rb_node **next)
 {
        struct inode_defrag *entry = NULL;
+       struct inode_defrag tmp;
        struct rb_node *p;
        struct rb_node *parent = NULL;
+       int ret;
+
+       tmp.ino = ino;
+       tmp.root = root;
 
        p = info->defrag_inodes.rb_node;
        while (p) {
                parent = p;
                entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-               if (ino < entry->ino)
+               ret = __compare_inode_defrag(&tmp, entry);
+               if (ret < 0)
                        p = parent->rb_left;
-               else if (ino > entry->ino)
+               else if (ret > 0)
                        p = parent->rb_right;
                else
                        return entry;
        }
 
        if (next) {
-               while (parent && ino > entry->ino) {
+               while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
                        parent = rb_next(parent);
                        entry = rb_entry(parent, struct inode_defrag, rb_node);
                }
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
        struct btrfs_key key;
        struct btrfs_ioctl_defrag_range_args range;
        u64 first_ino = 0;
+       u64 root_objectid = 0;
        int num_defrag;
        int defrag_batch = 1024;
 
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                n = NULL;
 
                /* find an inode to defrag */
-               defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+               defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
+                                                first_ino, &n);
                if (!defrag) {
-                       if (n)
-                               defrag = rb_entry(n, struct inode_defrag, rb_node);
-                       else if (first_ino) {
+                       if (n) {
+                               defrag = rb_entry(n, struct inode_defrag,
+                                                 rb_node);
+                       } else if (root_objectid || first_ino) {
+                               root_objectid = 0;
                                first_ino = 0;
                                continue;
                        } else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 
                /* remove it from the rbtree */
                first_ino = defrag->ino + 1;
+               root_objectid = defrag->root;
                rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
 
                if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                        goto next;
 
                /* do a chunk of defrag */
-               BTRFS_I(inode)->in_defrag = 0;
+               clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
                range.start = defrag->last_offset;
                num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
                                               defrag_batch);
@@ -1404,12 +1433,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                goto out;
        }
 
-       err = btrfs_update_time(file);
+       err = file_update_time(file);
        if (err) {
                mutex_unlock(&inode->i_mutex);
                goto out;
        }
-       BTRFS_I(inode)->sequence++;
 
        start_pos = round_down(pos, root->sectorsize);
        if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
         * flush down new bytes that may have been written if the
         * application were using truncate to replace a file in place.
         */
-       if (BTRFS_I(inode)->ordered_data_close) {
-               BTRFS_I(inode)->ordered_data_close = 0;
+       if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                              &BTRFS_I(inode)->runtime_flags)) {
                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                        filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
        trace_btrfs_sync_file(file, datasync);
 
-       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (ret)
-               return ret;
        mutex_lock(&inode->i_mutex);
 
-       /* we wait first, since the writeback may change the inode */
+       /*
+        * We wait first, since the writeback may change the inode.  Also,
+        * btrfs_wait_ordered_range does a filemap_write_and_wait_range, which
+        * is why we don't do it above like other file systems.
+        */
        root->log_batch++;
-       btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       btrfs_wait_ordered_range(inode, start, end);
        root->log_batch++;
 
        /*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
         * syncing
         */
        smp_mb();
-       if (BTRFS_I(inode)->last_trans <=
+       if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+           BTRFS_I(inode)->last_trans <=
            root->fs_info->last_trans_committed) {
                BTRFS_I(inode)->last_trans = 0;
                mutex_unlock(&inode->i_mutex);
index 202008ec367d4c4c2cfcf73f7289692dd910b25c..81296c57405a5d53a27dba626a4d6201829bd578 100644 (file)
@@ -33,6 +33,8 @@
 
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
                           struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+                             struct btrfs_free_space *info);
 
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                                               struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                return ERR_PTR(-ENOENT);
        }
 
-       inode->i_mapping->flags &= ~__GFP_FS;
+       mapping_set_gfp_mask(inode->i_mapping,
+                       mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 
        return inode;
 }
@@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
 
 static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 {
-       u64 *val;
+       __le64 *val;
 
        io_ctl_map_page(io_ctl, 1);
 
@@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 
 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 {
-       u64 *gen;
+       __le64 *gen;
 
        /*
         * Skip the crc area.  If we don't check crcs then we just have a 64bit
@@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
        return 0;
 }
 
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries.  This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache.  So run through the space cache that we just
+ * loaded and merge contiguous entries.  This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+       struct btrfs_free_space *e, *prev = NULL;
+       struct rb_node *n;
+
+again:
+       spin_lock(&ctl->tree_lock);
+       for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+               e = rb_entry(n, struct btrfs_free_space, offset_index);
+               if (!prev)
+                       goto next;
+               if (e->bitmap || prev->bitmap) /* bitmaps are not merged */
+                       goto next;
+               if (prev->offset + prev->bytes == e->offset) {
+                       unlink_free_space(ctl, prev);
+                       unlink_free_space(ctl, e);
+                       prev->bytes += e->bytes; /* prev absorbs e */
+                       kmem_cache_free(btrfs_free_space_cachep, e);
+                       link_free_space(ctl, prev);
+                       prev = NULL;
+                       spin_unlock(&ctl->tree_lock);
+                       goto again; /* lock dropped; rescan from rb_first() */
+               }
+next:
+               prev = e;
+       }
+       spin_unlock(&ctl->tree_lock);
+}
+
 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                            struct btrfs_free_space_ctl *ctl,
                            struct btrfs_path *path, u64 offset)
@@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
        }
 
        io_ctl_drop_pages(&io_ctl);
+       merge_space_tree(ctl);
        ret = 1;
 out:
        io_ctl_free(&io_ctl);
@@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                goto out;
 
 
-       ret = filemap_write_and_wait(inode->i_mapping);
-       if (ret)
-               goto out;
+       btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
        key.objectid = BTRFS_FREE_SPACE_OBJECTID;
        key.offset = offset;
index ceb7b9c9edcc1436693178fd6d2ff62f2334ada7..f6ab6f5e635a39b18ddb7f259bf5f0edd25d10a0 100644 (file)
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 
 static int btrfs_setsize(struct inode *inode, loff_t newsize);
 static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
        ret = insert_inline_extent(trans, root, inode, start,
                                   inline_len, compressed_size,
                                   compress_type, compressed_pages);
-       if (ret) {
+       if (ret && ret != -ENOSPC) {
                btrfs_abort_transaction(trans, root, ret);
                return ret;
+       } else if (ret == -ENOSPC) {
+               return 1;
        }
+
        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
        if (btrfs_is_free_space_inode(root, inode))
                metadata = 2;
 
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
-       if (ret)
-               return ret;
-
        if (!(rw & REQ_WRITE)) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+               if (ret)
+                       return ret;
+
                if (bio_flags & EXTENT_BIO_COMPRESSED) {
                        return btrfs_submit_compressed_read(inode, bio,
                                                    mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
  */
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
+       struct inode *inode = ordered_extent->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans = NULL;
-       struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
        int compress_type = 0;
        int ret;
        bool nolock;
 
-       ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-                                            end - start + 1);
-       if (!ret)
-               return 0;
-       BUG_ON(!ordered_extent); /* Logic error */
-
        nolock = btrfs_is_free_space_inode(root, inode);
 
+       if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+               ret = -EIO;
+               goto out;
+       }
+
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                   ordered_extent->file_offset,
                                   ordered_extent->len);
        }
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
        if (ret < 0) {
                btrfs_abort_transaction(trans, root, ret);
-               goto out;
+               goto out_unlock;
        }
 
        add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                ret = btrfs_update_inode_fallback(trans, root, inode);
                if (ret) { /* -ENOMEM or corruption */
                        btrfs_abort_transaction(trans, root, ret);
-                       goto out;
+                       goto out_unlock;
                }
        }
        ret = 0;
+out_unlock:
+       unlock_extent_cached(io_tree, ordered_extent->file_offset,
+                            ordered_extent->file_offset +
+                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
 out:
        if (root != root->fs_info->tree_root)
                btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
                        btrfs_end_transaction(trans, root);
        }
 
+       if (ret)
+               clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+                                     ordered_extent->file_offset +
+                                     ordered_extent->len - 1, NULL, GFP_NOFS);
+
+       /*
+        * This needs to be done to make sure anybody waiting knows we are done
+        * updating everything for this ordered extent.
+        */
+       btrfs_remove_ordered_extent(inode, ordered_extent);
+
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
        btrfs_put_ordered_extent(ordered_extent);
 
-       return 0;
-out_unlock:
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
-       goto out;
+       return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+       struct btrfs_ordered_extent *ordered_extent;
+       ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+       btrfs_finish_ordered_io(ordered_extent); /* return value is dropped */
 }
 
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                 struct extent_state *state, int uptodate)
 {
+       struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_ordered_extent *ordered_extent = NULL;
+       struct btrfs_workers *workers;
+
        trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
        ClearPagePrivate2(page);
-       return btrfs_finish_ordered_io(page->mapping->host, start, end);
+       if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+                                           end - start + 1, uptodate))
+               return 0; /* nothing is queued on this path */
+
+       ordered_extent->work.func = finish_ordered_fn;
+       ordered_extent->work.flags = 0;
+
+       if (btrfs_is_free_space_inode(root, inode))
+               workers = &root->fs_info->endio_freespace_worker;
+       else
+               workers = &root->fs_info->endio_write_workers;
+       btrfs_queue_worker(workers, &ordered_extent->work);
+
+       return 0; /* always 0 once the work item has been queued */
 }
 
 /*
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
        struct btrfs_block_rsv *block_rsv;
        int ret;
 
-       if (!list_empty(&root->orphan_list) ||
+       if (atomic_read(&root->orphan_inodes) ||
            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
                return;
 
        spin_lock(&root->orphan_lock);
-       if (!list_empty(&root->orphan_list)) {
+       if (atomic_read(&root->orphan_inodes)) {
                spin_unlock(&root->orphan_lock);
                return;
        }
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                block_rsv = NULL;
        }
 
-       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                             &BTRFS_I(inode)->runtime_flags)) {
 #if 0
                /*
                 * For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                        insert = 1;
 #endif
                insert = 1;
+               atomic_dec(&root->orphan_inodes);
        }
 
-       if (!BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 1;
+       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                             &BTRFS_I(inode)->runtime_flags))
                reserve = 1;
-       }
        spin_unlock(&root->orphan_lock);
 
        /* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
        if (insert >= 1) {
                ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                if (ret && ret != -EEXIST) {
+                       clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                                 &BTRFS_I(inode)->runtime_flags);
                        btrfs_abort_transaction(trans, root, ret);
                        return ret;
                }
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
        int ret = 0;
 
        spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+       if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                              &BTRFS_I(inode)->runtime_flags))
                delete_item = 1;
-       }
 
-       if (BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 0;
+       if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                release_rsv = 1;
-       }
        spin_unlock(&root->orphan_lock);
 
        if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
                BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
        }
 
-       if (release_rsv)
+       if (release_rsv) {
                btrfs_orphan_release_metadata(inode);
+               atomic_dec(&root->orphan_inodes);
+       }
 
        return 0;
 }
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                ret = PTR_ERR(trans);
                                goto out;
                        }
+                       printk(KERN_ERR "auto deleting %Lu\n",
+                              found_key.objectid);
                        ret = btrfs_del_orphan_item(trans, root,
                                                    found_key.objectid);
                        BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                 * add this inode to the orphan list so btrfs_orphan_del does
                 * the proper thing when we hit it
                 */
-               spin_lock(&root->orphan_lock);
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-               spin_unlock(&root->orphan_lock);
+               set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                       &BTRFS_I(inode)->runtime_flags);
 
                /* if we have links, this was a truncate, lets do that */
                if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
 
        inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
        BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
-       BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
        inode->i_generation = BTRFS_I(inode)->generation;
        inode->i_rdev = 0;
        rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
 
        btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
        btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-       btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+       btrfs_set_inode_sequence(leaf, item, inode->i_version);
        btrfs_set_inode_transid(leaf, item, trans->transid);
        btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
        btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2752,6 +2790,8 @@ err:
                goto out;
 
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(inode);
+       inode_inc_iversion(dir);
        inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        btrfs_update_inode(trans, root, dir);
 out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
        }
 
        btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(dir);
        dir->i_mtime = dir->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, dir);
        if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                 * any new writes get down to disk quickly.
                 */
                if (newsize == 0)
-                       BTRFS_I(inode)->ordered_data_close = 1;
+                       set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                               &BTRFS_I(inode)->runtime_flags);
 
                /* we don't support swapfiles, so vmtruncate shouldn't fail */
                truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 
        if (attr->ia_valid) {
                setattr_copy(inode, attr);
+               inode_inc_iversion(inode);
                err = btrfs_dirty_inode(inode);
 
                if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
        btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
        if (root->fs_info->log_root_recovering) {
-               BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+               BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                               &BTRFS_I(inode)->runtime_flags));
                goto no_delete;
        }
 
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
 
        BTRFS_I(inode)->root = root;
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
-       BTRFS_I(inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 
        inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
        inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        int ret = 0;
        bool nolock = false;
 
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                return 0;
 
        if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
        struct btrfs_trans_handle *trans;
        int ret;
 
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                return 0;
 
        trans = btrfs_join_transaction(root);
@@ -4431,46 +4475,18 @@ int btrfs_dirty_inode(struct inode *inode)
  * This is a copy of file_update_time.  We need this so we can return error on
  * ENOSPC for updating the inode in the case of file write and mmap writes.
  */
-int btrfs_update_time(struct file *file)
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+                            int flags)
 {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       struct timespec now;
-       int ret;
-       enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
-
-       /* First try to exhaust all avenues to not sync */
-       if (IS_NOCMTIME(inode))
-               return 0;
-
-       now = current_fs_time(inode->i_sb);
-       if (!timespec_equal(&inode->i_mtime, &now))
-               sync_it = S_MTIME;
-
-       if (!timespec_equal(&inode->i_ctime, &now))
-               sync_it |= S_CTIME;
-
-       if (IS_I_VERSION(inode))
-               sync_it |= S_VERSION;
-
-       if (!sync_it)
-               return 0;
-
-       /* Finally allowed to write? Takes lock. */
-       if (mnt_want_write_file(file))
-               return 0;
-
-       /* Only change inode inside the lock region */
-       if (sync_it & S_VERSION)
+       if (flags & S_VERSION)
                inode_inc_iversion(inode);
-       if (sync_it & S_CTIME)
-               inode->i_ctime = now;
-       if (sync_it & S_MTIME)
-               inode->i_mtime = now;
-       ret = btrfs_dirty_inode(inode);
-       if (!ret)
-               mark_inode_dirty_sync(inode);
-       mnt_drop_write(file->f_path.mnt);
-       return ret;
+       if (flags & S_CTIME)
+               inode->i_ctime = *now;
+       if (flags & S_MTIME)
+               inode->i_mtime = *now;
+       if (flags & S_ATIME)
+               inode->i_atime = *now;
+       return btrfs_dirty_inode(inode); /* its result goes to the caller */
 }
 
 /*
@@ -4730,6 +4746,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
 
        btrfs_i_size_write(parent_inode, parent_inode->i_size +
                           name_len * 2);
+       inode_inc_iversion(parent_inode);
        parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, parent_inode);
        if (ret)
@@ -4937,6 +4954,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        }
 
        btrfs_inc_nlink(inode);
+       inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ihold(inode);
 
@@ -5903,9 +5921,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
        struct btrfs_dio_private *dip = bio->bi_private;
        struct inode *inode = dip->inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
        struct btrfs_ordered_extent *ordered = NULL;
-       struct extent_state *cached_state = NULL;
        u64 ordered_offset = dip->logical_offset;
        u64 ordered_bytes = dip->bytes;
        int ret;
@@ -5915,73 +5931,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
 again:
        ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                   &ordered_offset,
-                                                  ordered_bytes);
+                                                  ordered_bytes, !err);
        if (!ret)
                goto out_test;
 
-       BUG_ON(!ordered);
-
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans)) {
-               err = -ENOMEM;
-               goto out;
-       }
-       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
-       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
-               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-               if (!ret)
-                       err = btrfs_update_inode_fallback(trans, root, inode);
-               goto out;
-       }
-
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                        ordered->file_offset + ordered->len - 1, 0,
-                        &cached_state);
-
-       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-               ret = btrfs_mark_extent_written(trans, inode,
-                                               ordered->file_offset,
-                                               ordered->file_offset +
-                                               ordered->len);
-               if (ret) {
-                       err = ret;
-                       goto out_unlock;
-               }
-       } else {
-               ret = insert_reserved_file_extent(trans, inode,
-                                                 ordered->file_offset,
-                                                 ordered->start,
-                                                 ordered->disk_len,
-                                                 ordered->len,
-                                                 ordered->len,
-                                                 0, 0, 0,
-                                                 BTRFS_FILE_EXTENT_REG);
-               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                                  ordered->file_offset, ordered->len);
-               if (ret) {
-                       err = ret;
-                       WARN_ON(1);
-                       goto out_unlock;
-               }
-       }
-
-       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
-       ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-       if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-               btrfs_update_inode_fallback(trans, root, inode);
-       ret = 0;
-out_unlock:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                            ordered->file_offset + ordered->len - 1,
-                            &cached_state, GFP_NOFS);
-out:
-       btrfs_delalloc_release_metadata(inode, ordered->len);
-       btrfs_end_transaction(trans, root);
-       ordered_offset = ordered->file_offset + ordered->len;
-       btrfs_put_ordered_extent(ordered);
-       btrfs_put_ordered_extent(ordered);
-
+       ordered->work.func = finish_ordered_fn;
+       ordered->work.flags = 0;
+       btrfs_queue_worker(&root->fs_info->endio_write_workers,
+                          &ordered->work);
 out_test:
        /*
         * our bio might span multiple ordered extents.  If we haven't
@@ -5990,12 +5947,12 @@ out_test:
        if (ordered_offset < dip->logical_offset + dip->bytes) {
                ordered_bytes = dip->logical_offset + dip->bytes -
                        ordered_offset;
+               ordered = NULL;
                goto again;
        }
 out_done:
        bio->bi_private = dip->private;
 
-       kfree(dip->csums);
        kfree(dip);
 
        /* If we had an error make sure to clear the uptodate flag */
@@ -6063,9 +6020,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
        int ret;
 
        bio_get(bio);
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-       if (ret)
-               goto err;
+
+       if (!write) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+               if (ret)
+                       goto err;
+       }
 
        if (skip_sum)
                goto map;
@@ -6485,13 +6445,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
+       struct inode *inode = page->mapping->host;
        struct extent_io_tree *tree;
        struct btrfs_ordered_extent *ordered;
        struct extent_state *cached_state = NULL;
        u64 page_start = page_offset(page);
        u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-
        /*
         * we have the page locked, so new writeback can't start,
         * and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6461,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
         */
        wait_on_page_writeback(page);
 
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       tree = &BTRFS_I(inode)->io_tree;
        if (offset) {
                btrfs_releasepage(page, GFP_NOFS);
                return;
        }
        lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+       ordered = btrfs_lookup_ordered_extent(inode,
                                           page_offset(page));
        if (ordered) {
                /*
@@ -6522,9 +6482,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                 * whoever cleared the private bit is responsible
                 * for the finish_ordered_io
                 */
-               if (TestClearPagePrivate2(page)) {
-                       btrfs_finish_ordered_io(page->mapping->host,
-                                               page_start, page_end);
+               if (TestClearPagePrivate2(page) &&
+                   btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
+                                                  PAGE_CACHE_SIZE, 1)) {
+                       btrfs_finish_ordered_io(ordered);
                }
                btrfs_put_ordered_extent(ordered);
                cached_state = NULL;
@@ -6576,7 +6537,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (!ret) {
-               ret = btrfs_update_time(vma->vm_file);
+               ret = file_update_time(vma->vm_file);
                reserved = 1;
        }
        if (ret) {
@@ -6771,7 +6732,8 @@ static int btrfs_truncate(struct inode *inode)
         * using truncate to replace the contents of the file will
         * end up with a zero length file after a crash.
         */
-       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+       if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                                          &BTRFS_I(inode)->runtime_flags))
                btrfs_add_ordered_operation(trans, root, inode);
 
        while (1) {
@@ -6894,7 +6856,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->root = NULL;
        ei->space_info = NULL;
        ei->generation = 0;
-       ei->sequence = 0;
        ei->last_trans = 0;
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
@@ -6909,11 +6870,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->outstanding_extents = 0;
        ei->reserved_extents = 0;
 
-       ei->ordered_data_close = 0;
-       ei->orphan_meta_reserved = 0;
-       ei->dummy_inode = 0;
-       ei->in_defrag = 0;
-       ei->delalloc_meta_reserved = 0;
+       ei->runtime_flags = 0;
        ei->force_compress = BTRFS_COMPRESS_NONE;
 
        ei->delayed_node = NULL;
@@ -6927,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        mutex_init(&ei->log_mutex);
        mutex_init(&ei->delalloc_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-       INIT_LIST_HEAD(&ei->i_orphan);
        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->ordered_operations);
        RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6928,12 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
 
-       spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+       if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                    &BTRFS_I(inode)->runtime_flags)) {
                printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
                       (unsigned long long)btrfs_ino(inode));
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+               atomic_dec(&root->orphan_inodes);
        }
-       spin_unlock(&root->orphan_lock);
 
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7148,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
                btrfs_add_ordered_operation(trans, root, old_inode);
 
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
        old_dir->i_ctime = old_dir->i_mtime = ctime;
        new_dir->i_ctime = new_dir->i_mtime = ctime;
        old_inode->i_ctime = ctime;
@@ -7219,6 +7177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        }
 
        if (new_inode) {
+               inode_inc_iversion(new_inode);
                new_inode->i_ctime = CURRENT_TIME;
                if (unlikely(btrfs_ino(new_inode) ==
                             BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7449,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                cur_offset += ins.offset;
                *alloc_hint = ins.objectid + ins.offset;
 
+               inode_inc_iversion(inode);
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7647,6 +7607,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
        .permission     = btrfs_permission,
        .fiemap         = btrfs_fiemap,
        .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
        .getattr        = btrfs_getattr,
@@ -7657,6 +7618,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
 };
 static const struct inode_operations btrfs_symlink_inode_operations = {
        .readlink       = generic_readlink,
@@ -7670,6 +7632,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
        .listxattr      = btrfs_listxattr,
        .removexattr    = btrfs_removexattr,
        .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
 };
 
 const struct dentry_operations btrfs_dentry_operations = {
index 14f8e1faa46ee0478ebb83d6f82d205d25c1dc51..24b776c08d99f7bbb621076f68500464b6829435 100644 (file)
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
        }
 
        btrfs_update_iflags(inode);
+       inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, inode);
 
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
                return PTR_ERR(trans);
 
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                                     0, objectid, NULL, 0, 0, 0, 0);
+                                     0, objectid, NULL, 0, 0, 0);
        if (IS_ERR(leaf)) {
                ret = PTR_ERR(leaf);
                goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
        di_args->bytes_used = dev->bytes_used;
        di_args->total_bytes = dev->total_bytes;
        memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
-       if (dev->name)
+       if (dev->name) {
                strncpy(di_args->path, dev->name, sizeof(di_args->path));
-       else
+               di_args->path[sizeof(di_args->path) - 1] = 0;
+       } else {
                di_args->path[0] = '\0';
+       }
 
 out:
        if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        btrfs_mark_buffer_dirty(leaf);
                        btrfs_release_path(path);
 
+                       inode_inc_iversion(inode);
                        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
                        /*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
                up_read(&info->groups_sem);
        }
 
-       user_dest = (struct btrfs_ioctl_space_info *)
+       user_dest = (struct btrfs_ioctl_space_info __user *)
                (arg + sizeof(struct btrfs_ioctl_space_args));
 
        if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
        return ret;
 }
 
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+                                     void __user *arg, int reset_after_read)
+{
+       struct btrfs_ioctl_get_dev_stats *sa;
+       int ret;
+
+       if (reset_after_read && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+
+       ret = btrfs_get_dev_stats(root, sa, reset_after_read);
+
+       if (copy_to_user(arg, sa, sizeof(*sa)))
+               ret = -EFAULT;
+
+       kfree(sa);
+       return ret;
+}
+
 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 {
        int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
        }
 }
 
-static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
 {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_ioctl_balance_args *bargs;
        struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
        if (fs_info->sb->s_flags & MS_RDONLY)
                return -EROFS;
 
+       ret = mnt_want_write(file->f_path.mnt);
+       if (ret)
+               return ret;
+
        mutex_lock(&fs_info->volume_mutex);
        mutex_lock(&fs_info->balance_mutex);
 
@@ -3291,6 +3322,7 @@ out_bargs:
 out:
        mutex_unlock(&fs_info->balance_mutex);
        mutex_unlock(&fs_info->volume_mutex);
+       mnt_drop_write(file->f_path.mnt);
        return ret;
 }
 
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_DEV_INFO:
                return btrfs_ioctl_dev_info(root, argp);
        case BTRFS_IOC_BALANCE:
-               return btrfs_ioctl_balance(root, NULL);
+               return btrfs_ioctl_balance(file, NULL);
        case BTRFS_IOC_CLONE:
                return btrfs_ioctl_clone(file, arg, 0, 0, 0);
        case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
        case BTRFS_IOC_SCRUB_PROGRESS:
                return btrfs_ioctl_scrub_progress(root, argp);
        case BTRFS_IOC_BALANCE_V2:
-               return btrfs_ioctl_balance(root, argp);
+               return btrfs_ioctl_balance(file, argp);
        case BTRFS_IOC_BALANCE_CTL:
                return btrfs_ioctl_balance_ctl(root, arg);
        case BTRFS_IOC_BALANCE_PROGRESS:
                return btrfs_ioctl_balance_progress(root, argp);
+       case BTRFS_IOC_GET_DEV_STATS:
+               return btrfs_ioctl_get_dev_stats(root, argp, 0);
+       case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
+               return btrfs_ioctl_get_dev_stats(root, argp, 1);
        }
 
        return -ENOTTY;
index 086e6bdae1c4482b93b6dda4d16b1c5af288f2eb..497c530724cf6b7a50296d2c6660fef7f4066cb9 100644 (file)
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
        __u64                           inodes;
 };
 
+enum btrfs_dev_stat_values {
+       /* disk I/O failure stats */
+       BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
+       BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
+       BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
+
+       /* stats for indirect indications for I/O failures */
+       BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
+                                        * contents are illegal: this is an
+                                        * indication that the block was damaged
+                                        * during read or write, or written to
+                                        * wrong location or read from wrong
+                                        * location */
+       BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
+                                        * been written */
+
+       BTRFS_DEV_STAT_VALUES_MAX
+};
+
+struct btrfs_ioctl_get_dev_stats {
+       __u64 devid;                            /* in */
+       __u64 nr_items;                         /* in/out */
+
+       /* out values: */
+       __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
+
+       __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
                                        struct btrfs_ioctl_ino_path_args)
 #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
                                        struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
+                                     struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+                                       struct btrfs_ioctl_get_dev_stats)
 
 #endif
index bbf6d0d9aebe9b68f0ea8e5c121783d81733f7d7..9e138cdc36c5eb7d66bf80dfc37829878eeaa6e2 100644 (file)
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        entry->len = len;
        entry->disk_len = disk_len;
        entry->bytes_left = len;
-       entry->inode = inode;
+       entry->inode = igrab(inode);
        entry->compress_type = compress_type;
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 
        trace_btrfs_ordered_extent_add(inode, entry);
 
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        node = tree_insert(&tree->tree, file_offset,
                           &entry->rb_node);
        if (node)
                ordered_data_tree_panic(inode, -EEXIST, file_offset);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
 
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
        struct btrfs_ordered_inode_tree *tree;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        list_add_tail(&sum->list, &entry->list);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
 }
 
 /*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
  */
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
-                                  u64 *file_offset, u64 io_size)
+                                  u64 *file_offset, u64 io_size, int uptodate)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
        int ret;
+       unsigned long flags;
        u64 dec_end;
        u64 dec_start;
        u64 to_dec;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irqsave(&tree->lock, flags);
        node = tree_search(tree, *file_offset);
        if (!node) {
                ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                       (unsigned long long)to_dec);
        }
        entry->bytes_left -= to_dec;
+       if (!uptodate)
+               set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
        if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
        else
@@ -332,7 +336,7 @@ out:
                *cached = entry;
                atomic_inc(&entry->refs);
        }
-       spin_unlock(&tree->lock);
+       spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
 }
 
@@ -347,15 +351,21 @@ out:
  */
 int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
-                                  u64 file_offset, u64 io_size)
+                                  u64 file_offset, u64 io_size, int uptodate)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
        struct btrfs_ordered_extent *entry = NULL;
+       unsigned long flags;
        int ret;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irqsave(&tree->lock, flags);
+       if (cached && *cached) {
+               entry = *cached;
+               goto have_entry;
+       }
+
        node = tree_search(tree, file_offset);
        if (!node) {
                ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
        }
 
        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
        if (!offset_in_entry(entry, file_offset)) {
                ret = 1;
                goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
                       (unsigned long long)io_size);
        }
        entry->bytes_left -= io_size;
+       if (!uptodate)
+               set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
        if (entry->bytes_left == 0)
                ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
        else
@@ -383,7 +397,7 @@ out:
                *cached = entry;
                atomic_inc(&entry->refs);
        }
-       spin_unlock(&tree->lock);
+       spin_unlock_irqrestore(&tree->lock, flags);
        return ret == 0;
 }
 
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
        trace_btrfs_ordered_extent_put(entry->inode, entry);
 
        if (atomic_dec_and_test(&entry->refs)) {
+               if (entry->inode)
+                       btrfs_add_delayed_iput(entry->inode);
                while (!list_empty(&entry->list)) {
                        cur = entry->list.next;
                        sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 
 /*
  * remove an ordered extent from the tree.  No references are dropped
- * and you must wake_up entry->wait.  You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
  */
-static void __btrfs_remove_ordered_extent(struct inode *inode,
-                                         struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct rb_node *node;
 
        tree = &BTRFS_I(inode)->ordered_tree;
+       spin_lock_irq(&tree->lock);
        node = &entry->rb_node;
        rb_erase(node, &tree->tree);
        tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+       spin_unlock_irq(&tree->lock);
 
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
                list_del_init(&BTRFS_I(inode)->ordered_operations);
        }
        spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree.  No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
-                                struct btrfs_ordered_extent *entry)
-{
-       struct btrfs_ordered_inode_tree *tree;
-
-       tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
-       __btrfs_remove_ordered_extent(inode, entry);
-       spin_unlock(&tree->lock);
        wake_up(&entry->wait);
 }
 
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                if (orig_end > INT_LIMIT(loff_t))
                        orig_end = INT_LIMIT(loff_t);
        }
-again:
+
        /* start IO across the range first to instantiate any delalloc
         * extents
         */
-       filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-       /* The compression code will leave pages locked but return from
-        * writepage without setting the page writeback.  Starting again
-        * with WB_SYNC_ALL will end up waiting for the IO to actually start.
-        */
-       filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-       filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+       filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
 
        end = orig_end;
        found = 0;
@@ -657,11 +651,6 @@ again:
                        break;
                end--;
        }
-       if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-                          EXTENT_DELALLOC, 0, NULL)) {
-               schedule_timeout(1);
-               goto again;
-       }
 }
 
 /*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
        struct btrfs_ordered_extent *entry = NULL;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
        if (entry)
                atomic_inc(&entry->refs);
 out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
        return entry;
 }
 
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
        struct btrfs_ordered_extent *entry = NULL;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node) {
                node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
 out:
        if (entry)
                atomic_inc(&entry->refs);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
        return entry;
 }
 
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
        struct btrfs_ordered_extent *entry = NULL;
 
        tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        node = tree_search(tree, file_offset);
        if (!node)
                goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
        entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
        atomic_inc(&entry->refs);
 out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
        return entry;
 }
 
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered)
 {
        struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        u64 disk_i_size;
        u64 new_i_size;
        u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
        else
                offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
 
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        disk_i_size = BTRFS_I(inode)->disk_i_size;
 
        /* truncate file */
@@ -797,14 +785,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                goto out;
        }
 
-       /*
-        * we can't update the disk_isize if there are delalloc bytes
-        * between disk_i_size and  this ordered extent
-        */
-       if (test_range_bit(io_tree, disk_i_size, offset - 1,
-                          EXTENT_DELALLOC, 0, NULL)) {
-               goto out;
-       }
        /*
         * walk backward from this ordered extent to disk_i_size.
         * if we find an ordered extent then we can't update disk i_size
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                }
                node = prev;
        }
-       while (node) {
+       for (; node; node = rb_prev(node)) {
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+               /* We treat this entry as if it doesn't exist */
+               if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+                       continue;
                if (test->file_offset + test->len <= disk_i_size)
                        break;
                if (test->file_offset >= i_size)
                        break;
                if (test->file_offset >= disk_i_size)
                        goto out;
-               node = rb_prev(node);
        }
        new_i_size = min_t(u64, offset, i_size);
 
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                else
                        node = rb_first(&tree->tree);
        }
-       i_size_test = 0;
-       if (node) {
-               /*
-                * do we have an area where IO might have finished
-                * between our ordered extent and the next one.
-                */
+
+       /*
+        * We are looking for an area between our current extent and the next
+        * ordered extent to update the i_size to.  There are 3 cases here
+        *
+        * 1) We don't actually have anything and we can update to i_size.
+        * 2) We have stuff but they already did their i_size update so again we
+        * can just update to i_size.
+        * 3) We have an outstanding ordered extent so the most we can update
+        * our disk_i_size to is the start of the next offset.
+        */
+       i_size_test = i_size;
+       for (; node; node = rb_next(node)) {
                test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-               if (test->file_offset > offset)
+
+               if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+                       continue;
+               if (test->file_offset > offset) {
                        i_size_test = test->file_offset;
-       } else {
-               i_size_test = i_size;
+                       break;
+               }
        }
 
        /*
         * i_size_test is the end of a region after this ordered
-        * extent where there are no ordered extents.  As long as there
-        * are no delalloc bytes in this area, it is safe to update
-        * disk_i_size to the end of the region.
+        * extent where there are no ordered extents, we can safely set
+        * disk_i_size to this.
         */
-       if (i_size_test > offset &&
-           !test_range_bit(io_tree, offset, i_size_test - 1,
-                           EXTENT_DELALLOC, 0, NULL)) {
+       if (i_size_test > offset)
                new_i_size = min_t(u64, i_size_test, i_size);
-       }
        BTRFS_I(inode)->disk_i_size = new_i_size;
        ret = 0;
 out:
        /*
-        * we need to remove the ordered extent with the tree lock held
-        * so that other people calling this function don't find our fully
-        * processed ordered entry and skip updating the i_size
+        * We need to do this because we can't remove ordered extents until
+        * after the disk_i_size has been updated and then the inode has been
+        * updated to reflect the change, so we need to tell anybody who finds
+        * this ordered extent that we've already done all the real work, we
+        * just haven't completed all the other work.
         */
        if (ordered)
-               __btrfs_remove_ordered_extent(inode, ordered);
-       spin_unlock(&tree->lock);
-       if (ordered)
-               wake_up(&ordered->wait);
+               set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+       spin_unlock_irq(&tree->lock);
        return ret;
 }
 
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
        if (!ordered)
                return 1;
 
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
        list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
                if (disk_bytenr >= ordered_sum->bytenr) {
                        num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                }
        }
 out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
        btrfs_put_ordered_extent(ordered);
        return ret;
 }
index c355ad4dc1a66962d30557e9bbdc08ca9fc25da8..e03c560d299732cfe2114fe41d049b691a949e61 100644 (file)
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
 
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+                                      * has done its due diligence in updating
+                                      * the isize. */
+
 struct btrfs_ordered_extent {
        /* logical offset in the file */
        u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
 
        /* a per root list of all the pending ordered extents */
        struct list_head root_extent_list;
+
+       struct btrfs_work work;
 };
 
 
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
                                struct btrfs_ordered_extent *entry);
 int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
-                                  u64 file_offset, u64 io_size);
+                                  u64 file_offset, u64 io_size, int uptodate);
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
-                                  u64 *file_offset, u64 io_size);
+                                  u64 *file_offset, u64 io_size,
+                                  int uptodate);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                             u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
index f38e452486b8d12ba36589248579dc158981c3be..5e23684887eb8eb401594af69b1be7372f7188aa 100644 (file)
@@ -294,6 +294,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
                               btrfs_dev_extent_chunk_offset(l, dev_extent),
                               (unsigned long long)
                               btrfs_dev_extent_length(l, dev_extent));
+               case BTRFS_DEV_STATS_KEY:
+                       printk(KERN_INFO "\t\tdevice stats\n");
+                       break;
                };
        }
 }
index ac5d010858848d007e380d529476ad9eb4f6fb31..48a4882d8ad5955eaa0be2b940e35f0b3b2a7f6f 100644 (file)
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
 {
        struct reada_machine_work *rmw;
        struct btrfs_fs_info *fs_info;
+       int old_ioprio;
 
        rmw = container_of(work, struct reada_machine_work, work);
        fs_info = rmw->fs_info;
 
        kfree(rmw);
 
+       old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+                                      task_nice_ioprio(current));
+       set_task_ioprio(current, BTRFS_IOPRIO_READA);
        __reada_start_machine(fs_info);
+       set_task_ioprio(current, old_ioprio);
 }
 
 static void __reada_start_machine(struct btrfs_fs_info *fs_info)
index 2f3d6f917fb3373c02335b6912fcba1006f5fabe..a38cfa4f251ec1065410f561188c4adf5868cea3 100644 (file)
@@ -50,7 +50,7 @@ struct scrub_dev;
 struct scrub_page {
        struct scrub_block      *sblock;
        struct page             *page;
-       struct block_device     *bdev;
+       struct btrfs_device     *dev;
        u64                     flags;  /* extent flags */
        u64                     generation;
        u64                     logical;
@@ -86,6 +86,7 @@ struct scrub_block {
                unsigned int    header_error:1;
                unsigned int    checksum_error:1;
                unsigned int    no_io_error_seen:1;
+               unsigned int    generation_error:1; /* also sets header_error */
        };
 };
 
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                sdev->stat.read_errors++;
                sdev->stat.uncorrectable_errors++;
                spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                goto out;
        }
 
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                sdev->stat.read_errors++;
                sdev->stat.uncorrectable_errors++;
                spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                goto out;
        }
        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                sdev->stat.read_errors++;
                sdev->stat.uncorrectable_errors++;
                spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                goto out;
        }
 
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                spin_unlock(&sdev->stat_lock);
                if (__ratelimit(&_rs))
                        scrub_print_warning("i/o error", sblock_to_check);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
        } else if (sblock_bad->checksum_error) {
                spin_lock(&sdev->stat_lock);
                sdev->stat.csum_errors++;
                spin_unlock(&sdev->stat_lock);
                if (__ratelimit(&_rs))
                        scrub_print_warning("checksum error", sblock_to_check);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_CORRUPTION_ERRS);
        } else if (sblock_bad->header_error) {
                spin_lock(&sdev->stat_lock);
                sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                if (__ratelimit(&_rs))
                        scrub_print_warning("checksum/header error",
                                            sblock_to_check);
+               if (sblock_bad->generation_error)
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_GENERATION_ERRS);
+               else
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
        }
 
        if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
                        page = sblock->pagev + page_index;
                        page->logical = logical;
                        page->physical = bbio->stripes[mirror_index].physical;
-                       /* for missing devices, bdev is NULL */
-                       page->bdev = bbio->stripes[mirror_index].dev->bdev;
+                       /* for missing devices, dev->bdev is NULL */
+                       page->dev = bbio->stripes[mirror_index].dev;
                        page->mirror_num = mirror_index + 1;
                        page->page = alloc_page(GFP_NOFS);
                        if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                struct scrub_page *page = sblock->pagev + page_num;
                DECLARE_COMPLETION_ONSTACK(complete);
 
-               if (page->bdev == NULL) {
+               if (page->dev->bdev == NULL) {
                        page->io_error = 1;
                        sblock->no_io_error_seen = 0;
                        continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                bio = bio_alloc(GFP_NOFS, 1);
                if (!bio)
                        return -EIO;
-               bio->bi_bdev = page->bdev;
+               bio->bi_bdev = page->dev->bdev;
                bio->bi_sector = page->physical >> 9;
                bio->bi_end_io = scrub_complete_bio_end_io;
                bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                h = (struct btrfs_header *)mapped_buffer;
 
                if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
-                   generation != le64_to_cpu(h->generation) ||
                    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
                    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
-                          BTRFS_UUID_SIZE))
+                          BTRFS_UUID_SIZE)) {
                        sblock->header_error = 1;
+               } else if (generation != le64_to_cpu(h->generation)) {
+                       sblock->header_error = 1;
+                       sblock->generation_error = 1;
+               }
                csum = h->csum;
        } else {
                if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                bio = bio_alloc(GFP_NOFS, 1);
                if (!bio)
                        return -EIO;
-               bio->bi_bdev = page_bad->bdev;
+               bio->bi_bdev = page_bad->dev->bdev;
                bio->bi_sector = page_bad->physical >> 9;
                bio->bi_end_io = scrub_complete_bio_end_io;
                bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 
                /* this will also unplug the queue */
                wait_for_completion(&complete);
+               if (!bio_flagged(bio, BIO_UPTODATE)) {
+                       btrfs_dev_stat_inc_and_print(page_bad->dev,
+                               BTRFS_DEV_STAT_WRITE_ERRS);
+                       bio_put(bio);
+                       return -EIO;
+               }
                bio_put(bio);
        }
 
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
        u64 mapped_size;
        void *p;
        u32 crc = ~(u32)0;
-       int fail = 0;
+       int fail_gen = 0;
+       int fail_cor = 0;
        u64 len;
        int index;
 
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
        memcpy(on_disk_csum, s->csum, sdev->csum_size);
 
        if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
-               ++fail;
+               ++fail_cor;
 
        if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
-               ++fail;
+               ++fail_gen;
 
        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
-               ++fail;
+               ++fail_cor;
 
        len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 
        btrfs_csum_final(crc, calculated_csum);
        if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
-               ++fail;
+               ++fail_cor;
 
-       if (fail) {
+       if (fail_cor + fail_gen) {
                /*
                 * if we find an error in a super block, we just report it.
                 * They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
                spin_lock(&sdev->stat_lock);
                ++sdev->stat.super_errors;
                spin_unlock(&sdev->stat_lock);
+               if (fail_cor)
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
+               else
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_GENERATION_ERRS);
        }
 
-       return fail;
+       return fail_cor + fail_gen;
 }
 
 static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
                        return -ENOMEM;
                }
                spage->sblock = sblock;
-               spage->bdev = sdev->dev->bdev;
+               spage->dev = sdev->dev;
                spage->flags = flags;
                spage->generation = gen;
                spage->logical = logical;
index c5f8fca4195fca9eb3806ebfbccf52d03049691e..96eb9fef7bd279584cf4dd8b6ed42cc09e425c1d 100644 (file)
@@ -188,7 +188,8 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
        va_start(args, fmt);
 
        if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
-               strncpy(lvl, fmt, 3);
+               memcpy(lvl, fmt, 3);
+               lvl[3] = '\0';
                fmt += 3;
                type = logtypes[fmt[1] - '0'];
        } else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                case Opt_thread_pool:
                        intarg = 0;
                        match_int(&args[0], &intarg);
-                       if (intarg) {
+                       if (intarg)
                                info->thread_pool_size = intarg;
-                               printk(KERN_INFO "btrfs: thread pool %d\n",
-                                      info->thread_pool_size);
-                       }
                        break;
                case Opt_max_inline:
                        num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
        sb->s_flags |= MS_POSIXACL;
 #endif
-
+       sb->s_flags |= MS_I_VERSION;
        err = open_ctree(sb, fs_devices, (char *)data);
        if (err) {
                printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
  */
 static char *setup_root_args(char *args)
 {
-       unsigned copied = 0;
-       unsigned len = strlen(args) + 2;
-       char *pos;
-       char *ret;
+       unsigned len = strlen(args) + 2 + 1;
+       char *src, *dst, *buf;
 
        /*
-        * We need the same args as before, but minus
-        *
-        * subvol=a
-        *
-        * and add
-        *
-        * subvolid=0
+        * We need the same args as before, but with this substitution:
+        * s!subvol=[^,]+!subvolid=0!
         *
-        * which is a difference of 2 characters, so we allocate strlen(args) +
-        * 2 characters.
+        * Since the replacement string is up to 2 bytes longer than the
+        * original, allocate strlen(args) + 2 + 1 bytes.
         */
-       ret = kzalloc(len * sizeof(char), GFP_NOFS);
-       if (!ret)
-               return NULL;
-       pos = strstr(args, "subvol=");
 
+       src = strstr(args, "subvol=");
        /* This shouldn't happen, but just in case.. */
-       if (!pos) {
-               kfree(ret);
+       if (!src)
+               return NULL;
+
+       buf = dst = kmalloc(len, GFP_NOFS);
+       if (!buf)
                return NULL;
-       }
 
        /*
-        * The subvol=<> arg is not at the front of the string, copy everybody
-        * up to that into ret.
+        * If the subvol= arg is not at the start of the string,
+        * copy whatever precedes it into buf.
         */
-       if (pos != args) {
-               *pos = '\0';
-               strcpy(ret, args);
-               copied += strlen(args);
-               pos++;
+       if (src != args) {
+               *src++ = '\0';
+               strcpy(buf, args);
+               dst += strlen(args);
        }
 
-       strncpy(ret + copied, "subvolid=0", len - copied);
-
-       /* Length of subvolid=0 */
-       copied += 10;
+       strcpy(dst, "subvolid=0");
+       dst += strlen("subvolid=0");
 
        /*
-        * If there is no , after the subvol= option then we know there's no
-        * other options and we can just return.
+        * If there is a "," after the original subvol=... string,
+        * copy that suffix into our buffer.  Otherwise, we're done.
         */
-       pos = strchr(pos, ',');
-       if (!pos)
-               return ret;
+       src = strchr(src, ',');
+       if (src)
+               strcpy(dst, src);
 
-       /* Copy the rest of the arguments into our buffer */
-       strncpy(ret + copied, pos, len - copied);
-       copied += strlen(pos);
-
-       return ret;
+       return buf;
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1101,40 @@ error_fs_info:
        return ERR_PTR(error);
 }
 
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+       spin_lock_irq(&workers->lock);
+       workers->max_workers = new_limit;
+       spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+                                    int new_pool_size, int old_pool_size)
+{
+       if (new_pool_size == old_pool_size)
+               return;
+
+       fs_info->thread_pool_size = new_pool_size;
+
+       printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+              old_pool_size, new_pool_size);
+
+       btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+       btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+       btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+       btrfs_set_max_workers(&fs_info->scrub_workers, new_pool_size);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                goto restore;
        }
 
+       btrfs_resize_thread_pool(fs_info,
+               fs_info->thread_pool_size, old_thread_pool_size);
+
        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                return 0;
 
@@ -1180,7 +1200,8 @@ restore:
        fs_info->compress_type = old_compress_type;
        fs_info->max_inline = old_max_inline;
        fs_info->alloc_start = old_alloc_start;
-       fs_info->thread_pool_size = old_thread_pool_size;
+       btrfs_resize_thread_pool(fs_info,
+               old_thread_pool_size, fs_info->thread_pool_size);
        fs_info->metadata_ratio = old_metadata_ratio;
        return ret;
 }
index 36422254ef6765c14290a2373fa6d83cf2d364d5..1791c6e3d83487d82c9ffe80ab0239976cfd1c96 100644 (file)
@@ -28,6 +28,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "inode-map.h"
+#include "volumes.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
 static noinline int join_transaction(struct btrfs_root *root, int nofail)
 {
        struct btrfs_transaction *cur_trans;
+       struct btrfs_fs_info *fs_info = root->fs_info;
 
-       spin_lock(&root->fs_info->trans_lock);
+       spin_lock(&fs_info->trans_lock);
 loop:
        /* The file system has been taken offline. No new transactions. */
-       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-               spin_unlock(&root->fs_info->trans_lock);
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               spin_unlock(&fs_info->trans_lock);
                return -EROFS;
        }
 
-       if (root->fs_info->trans_no_join) {
+       if (fs_info->trans_no_join) {
                if (!nofail) {
-                       spin_unlock(&root->fs_info->trans_lock);
+                       spin_unlock(&fs_info->trans_lock);
                        return -EBUSY;
                }
        }
 
-       cur_trans = root->fs_info->running_transaction;
+       cur_trans = fs_info->running_transaction;
        if (cur_trans) {
                if (cur_trans->aborted) {
-                       spin_unlock(&root->fs_info->trans_lock);
+                       spin_unlock(&fs_info->trans_lock);
                        return cur_trans->aborted;
                }
                atomic_inc(&cur_trans->use_count);
                atomic_inc(&cur_trans->num_writers);
                cur_trans->num_joined++;
-               spin_unlock(&root->fs_info->trans_lock);
+               spin_unlock(&fs_info->trans_lock);
                return 0;
        }
-       spin_unlock(&root->fs_info->trans_lock);
+       spin_unlock(&fs_info->trans_lock);
 
        cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
        if (!cur_trans)
                return -ENOMEM;
 
-       spin_lock(&root->fs_info->trans_lock);
-       if (root->fs_info->running_transaction) {
+       spin_lock(&fs_info->trans_lock);
+       if (fs_info->running_transaction) {
                /*
                 * someone started a transaction after we unlocked.  Make sure
                 * to redo the trans_no_join checks above
                 */
                kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-               cur_trans = root->fs_info->running_transaction;
+               cur_trans = fs_info->running_transaction;
                goto loop;
        }
 
@@ -121,20 +123,38 @@ loop:
        cur_trans->delayed_refs.flushing = 0;
        cur_trans->delayed_refs.run_delayed_start = 0;
        cur_trans->delayed_refs.seq = 1;
+
+       /*
+        * although the tree mod log is per file system and not per transaction,
+        * the log must never go across transaction boundaries.
+        */
+       smp_mb();
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+                       "creating a fresh transaction\n");
+               WARN_ON(1);
+       }
+       if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+               printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+                       "creating a fresh transaction\n");
+               WARN_ON(1);
+       }
+       atomic_set(&fs_info->tree_mod_seq, 0);
+
        init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
        spin_lock_init(&cur_trans->commit_lock);
        spin_lock_init(&cur_trans->delayed_refs.lock);
        INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
 
        INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-       list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+       list_add_tail(&cur_trans->list, &fs_info->trans_list);
        extent_io_tree_init(&cur_trans->dirty_pages,
-                            root->fs_info->btree_inode->i_mapping);
-       root->fs_info->generation++;
-       cur_trans->transid = root->fs_info->generation;
-       root->fs_info->running_transaction = cur_trans;
+                            fs_info->btree_inode->i_mapping);
+       fs_info->generation++;
+       cur_trans->transid = fs_info->generation;
+       fs_info->running_transaction = cur_trans;
        cur_trans->aborted = 0;
-       spin_unlock(&root->fs_info->trans_lock);
+       spin_unlock(&fs_info->trans_lock);
 
        return 0;
 }
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
        if (ret)
                return ret;
 
+       ret = btrfs_run_dev_stats(trans, root->fs_info);
+       BUG_ON(ret);
+
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
                list_del_init(next);
index eb1ae908582cc51162a61798c80f3ed38e7ab6e8..2017d0ff511ca3304dad46e85045ad2ab28d4e75 100644 (file)
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
        int i;
        int ret;
 
-       btrfs_read_buffer(eb, gen);
+       ret = btrfs_read_buffer(eb, gen);
+       if (ret)
+               return ret;
 
        level = btrfs_header_level(eb);
 
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
                        path->slots[*level]++;
                        if (wc->free) {
-                               btrfs_read_buffer(next, ptr_gen);
+                               ret = btrfs_read_buffer(next, ptr_gen);
+                               if (ret) {
+                                       free_extent_buffer(next);
+                                       return ret;
+                               }
 
                                btrfs_tree_lock(next);
                                btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                        free_extent_buffer(next);
                        continue;
                }
-               btrfs_read_buffer(next, ptr_gen);
+               ret = btrfs_read_buffer(next, ptr_gen);
+               if (ret) {
+                       free_extent_buffer(next);
+                       return ret;
+               }
 
                WARN_ON(*level <= 0);
                if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                btrfs_release_path(path);
        }
        btrfs_release_path(path);
+       if (ret > 0)
+               ret = 0;
        return ret;
 }
 
@@ -3028,21 +3040,6 @@ out:
        return ret;
 }
 
-static int inode_in_log(struct btrfs_trans_handle *trans,
-                struct inode *inode)
-{
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
-
-       mutex_lock(&root->log_mutex);
-       if (BTRFS_I(inode)->logged_trans == trans->transid &&
-           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-               ret = 1;
-       mutex_unlock(&root->log_mutex);
-       return ret;
-}
-
-
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
        if (ret)
                goto end_no_trans;
 
-       if (inode_in_log(trans, inode)) {
+       if (btrfs_inode_in_log(inode, trans->transid)) {
                ret = BTRFS_NO_LOG_SYNC;
                goto end_no_trans;
        }
index 12f5147bd2b1ae2a6016e7283c72ceccb44283b7..ab942f46b3dd81e06348c4950901f3e4eef87016 100644 (file)
@@ -23,9 +23,9 @@
  *
  * ulist = ulist_alloc();
  * ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
  *
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter))) {
  *     for (all child nodes n in elem)
  *             ulist_add(ulist, n);
  *     do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
  *
  * The allocated ulist will be returned in an initialized state.
  */
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
 {
        struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
 
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
  * unaltered.
  */
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             unsigned long gfp_mask)
+             gfp_t gfp_mask)
+{
+       return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                   unsigned long *old_aux, gfp_t gfp_mask)
 {
        int i;
 
        for (i = 0; i < ulist->nnodes; ++i) {
-               if (ulist->nodes[i].val == val)
+               if (ulist->nodes[i].val == val) {
+                       if (old_aux)
+                               *old_aux = ulist->nodes[i].aux;
                        return 0;
+               }
        }
 
        if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
 /**
  * ulist_next - iterate ulist
  * @ulist:     ulist to iterate
- * @prev:      previously returned element or %NULL to start iteration
+ * @uiter:     iterator variable, initialized with ULIST_ITER_INIT(&iterator)
  *
  * Note: locking must be provided by the caller. In case of rwlocks only read
  *       locking is needed
  *
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate over a ulist.
+ * It returns the next element from the ulist or %NULL when the
  * end is reached. No guarantee is made with respect to the order in which
  * the elements are returned. They might neither be returned in order of
  * addition nor in ascending order.
  * It is allowed to call ulist_add during an enumeration. Newly added items
  * are guaranteed to show up in the running enumeration.
  */
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
 {
-       int next;
-
        if (ulist->nnodes == 0)
                return NULL;
-
-       if (!prev)
-               return &ulist->nodes[0];
-
-       next = (prev - ulist->nodes) + 1;
-       if (next < 0 || next >= ulist->nnodes)
+       if (uiter->i < 0 || uiter->i >= ulist->nnodes)
                return NULL;
 
-       return &ulist->nodes[next];
+       return &ulist->nodes[uiter->i++];
 }
 EXPORT_SYMBOL(ulist_next);
index 2e25dec58ec0e56251fbca880d27cc927aac95dc..21bdc8ec813046ac56e3c7db0739bcdba7ac188a 100644 (file)
  */
 #define ULIST_SIZE 16
 
+struct ulist_iterator {
+       int i;
+};
+
 /*
  * element of the list
  */
@@ -59,10 +63,15 @@ struct ulist {
 void ulist_init(struct ulist *ulist);
 void ulist_fini(struct ulist *ulist);
 void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
 void ulist_free(struct ulist *ulist);
 int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+             gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                   unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+                             struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
 
 #endif
index 1411b99555a4c1f138a6a3bf699842849d2b3e08..7782020996feccd4b7103528a4c2989230f79b71 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <asm/div64.h>
 #include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
                        return -ENOMEM;
                }
                device->devid = devid;
+               device->dev_stats_valid = 0;
                device->work.func = pending_bios_fn;
                memcpy(device->uuid, disk_super->dev_item.uuid,
                       BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
        int ret = 0;
 
        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
-               return -EINVAL;
+               return -EROFS;
 
        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
                                  root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
        return 0;
 }
 
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+                                                unsigned int stripe_index)
+{
+       /*
+        * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+        * at most 1.
+        * The alternative solution (instead of stealing bits from the
+        * pointer) would be to allocate an intermediate structure
+        * that contains the old private pointer plus the stripe_index.
+        */
+       BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+       BUG_ON(stripe_index > 3);
+       return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+       return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+       return (unsigned int)((uintptr_t)bi_private) & 3;
+}
+
 static void btrfs_end_bio(struct bio *bio, int err)
 {
-       struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
        int is_orig_bio = 0;
 
-       if (err)
+       if (err) {
                atomic_inc(&bbio->error);
+               if (err == -EIO || err == -EREMOTEIO) {
+                       unsigned int stripe_index =
+                               extract_stripe_index_from_bio_private(
+                                       bio->bi_private);
+                       struct btrfs_device *dev;
+
+                       BUG_ON(stripe_index >= bbio->num_stripes);
+                       dev = bbio->stripes[stripe_index].dev;
+                       if (bio->bi_rw & WRITE)
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_WRITE_ERRS);
+                       else
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_READ_ERRS);
+                       if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_FLUSH_ERRS);
+                       btrfs_dev_stat_print_on_error(dev);
+               }
+       }
 
        if (bio == bbio->orig_bio)
                is_orig_bio = 1;
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                        bio = first_bio;
                }
                bio->bi_private = bbio;
+               bio->bi_private = merge_stripe_index_into_bio_private(
+                               bio->bi_private, (unsigned int)dev_nr);
                bio->bi_end_io = btrfs_end_bio;
                bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
                dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
        return ret;
 }
 
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                  u64 logical, int mirror_num)
+{
+       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+       int ret;
+       u64 map_length = 0;
+       struct btrfs_bio *bbio = NULL;
+       struct btrfs_device *device;
+
+       BUG_ON(mirror_num == 0);
+       ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+                             mirror_num);
+       if (ret) {
+               BUG_ON(bbio != NULL);
+               return NULL;
+       }
+       BUG_ON(mirror_num != bbio->mirror_num);
+       device = bbio->stripes[mirror_num - 1].dev;
+       kfree(bbio);
+       return device;
+}
+
 int btrfs_read_chunk_tree(struct btrfs_root *root)
 {
        struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
        btrfs_free_path(path);
        return ret;
 }
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+               btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct extent_buffer *eb;
+       int slot;
+       int ret = 0;
+       struct btrfs_device *device;
+       struct btrfs_path *path = NULL;
+       int i;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               int item_size;
+               struct btrfs_dev_stats_item *ptr;
+
+               key.objectid = 0;
+               key.type = BTRFS_DEV_STATS_KEY;
+               key.offset = device->devid;
+               ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+               if (ret) {
+                       printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+                              device->name, (unsigned long long)device->devid);
+                       __btrfs_reset_dev_stats(device);
+                       device->dev_stats_valid = 1;
+                       btrfs_release_path(path);
+                       continue;
+               }
+               slot = path->slots[0];
+               eb = path->nodes[0];
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               item_size = btrfs_item_size_nr(eb, slot);
+
+               ptr = btrfs_item_ptr(eb, slot,
+                                    struct btrfs_dev_stats_item);
+
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+                       if (item_size >= (1 + i) * sizeof(__le64))
+                               btrfs_dev_stat_set(device, i,
+                                       btrfs_dev_stats_value(eb, ptr, i));
+                       else
+                               btrfs_dev_stat_reset(device, i);
+               }
+
+               device->dev_stats_valid = 1;
+               btrfs_dev_stat_print_on_load(device);
+               btrfs_release_path(path);
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+       btrfs_free_path(path);
+       return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *dev_root,
+                               struct btrfs_device *device)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *eb;
+       struct btrfs_dev_stats_item *ptr;
+       int ret;
+       int i;
+
+       key.objectid = 0;
+       key.type = BTRFS_DEV_STATS_KEY;
+       key.offset = device->devid;
+
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+       ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+       if (ret < 0) {
+               printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+                      ret, device->name);
+               goto out;
+       }
+
+       if (ret == 0 &&
+           btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+               /* need to delete old one and insert a new one */
+               ret = btrfs_del_item(trans, dev_root, path);
+               if (ret != 0) {
+                       printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+                              device->name, ret);
+                       goto out;
+               }
+               ret = 1;
+       }
+
+       if (ret == 1) {
+               /* need to insert a new item */
+               btrfs_release_path(path);
+               ret = btrfs_insert_empty_item(trans, dev_root, path,
+                                             &key, sizeof(*ptr));
+               if (ret < 0) {
+                       printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+                              device->name, ret);
+                       goto out;
+               }
+       }
+
+       eb = path->nodes[0];
+       ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+       for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+               btrfs_set_dev_stats_value(eb, ptr, i,
+                                         btrfs_dev_stat_read(device, i));
+       btrfs_mark_buffer_dirty(eb);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       int ret = 0;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               if (!device->dev_stats_valid || !device->dev_stats_dirty)
+                       continue;
+
+               ret = update_dev_stat_item(trans, dev_root, device);
+               if (!ret)
+                       device->dev_stats_dirty = 0;
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+       btrfs_dev_stat_inc(dev, index);
+       btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+       if (!dev->dev_stats_valid)
+               return;
+       printk_ratelimited(KERN_ERR
+                          "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+                          dev->name,
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+                          btrfs_dev_stat_read(dev,
+                                              BTRFS_DEV_STAT_CORRUPTION_ERRS),
+                          btrfs_dev_stat_read(dev,
+                                              BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+       printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+              dev->name,
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                       struct btrfs_ioctl_get_dev_stats *stats,
+                       int reset_after_read)
+{
+       struct btrfs_device *dev;
+       struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+       int i;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       if (!dev) {
+               printk(KERN_WARNING
+                      "btrfs: get dev_stats failed, device not found\n");
+               return -ENODEV;
+       } else if (!dev->dev_stats_valid) {
+               printk(KERN_WARNING
+                      "btrfs: get dev_stats failed, not yet valid\n");
+               return -ENODEV;
+       } else if (reset_after_read) {
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+                       if (stats->nr_items > i)
+                               stats->values[i] =
+                                       btrfs_dev_stat_read_and_reset(dev, i);
+                       else
+                               btrfs_dev_stat_reset(dev, i);
+               }
+       } else {
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+                       if (stats->nr_items > i)
+                               stats->values[i] = btrfs_dev_stat_read(dev, i);
+       }
+       if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+               stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+       return 0;
+}
index bb6b03f97aaa089793d667fae93335373773a7eb..3406a88ca83e023429b8af19f2d6aa64d4cac6f8 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/bio.h>
 #include <linux/sort.h>
 #include "async-thread.h"
+#include "ioctl.h"
 
 #define BTRFS_STRIPE_LEN       (64 * 1024)
 
@@ -106,6 +107,11 @@ struct btrfs_device {
        struct completion flush_wait;
        int nobarriers;
 
+       /* disk I/O failure stats. For detailed description refer to
+        * enum btrfs_dev_stat_values in ioctl.h */
+       int dev_stats_valid;
+       int dev_stats_dirty; /* counters need to be written to disk */
+       atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
 };
 
 struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                  u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                       struct btrfs_ioctl_get_dev_stats *stats,
+                       int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+                                     int index)
+{
+       atomic_inc(dev->dev_stat_values + index);
+       dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+                                     int index)
+{
+       return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+                                               int index)
+{
+       int ret;
+
+       ret = atomic_xchg(dev->dev_stat_values + index, 0);
+       dev->dev_stats_dirty = 1;
+       return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+                                     int index, unsigned long val)
+{
+       atomic_set(dev->dev_stat_values + index, val);
+       dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+                                       int index)
+{
+       btrfs_dev_stat_set(dev, index, 0);
+}
 #endif
index e7a5659087e66f93769bc750562d21294c9bd2b6..3f4e2d69e83a13cb66f3f3a56024f53f5299f5c4 100644 (file)
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        if (ret)
                goto out;
 
+       inode_inc_iversion(inode);
        inode->i_ctime = CURRENT_TIME;
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
index ad5938ca357c270ace08388401176f22a6343571..838a9cf246bd0fa561ab66295f9bb3df77e0c6a2 100644 (file)
@@ -3152,7 +3152,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
 /*
  * Buffer-head allocation
  */
-static struct kmem_cache *bh_cachep;
+static struct kmem_cache *bh_cachep __read_mostly;
 
 /*
  * Once the number of bh's in the machine exceeds this level, we start
index fbb2a643ef10a1f75c4918f165c9e3a22a603a86..8e1b60e557b65bea0df86a881376456658a9cffd 100644 (file)
@@ -40,38 +40,49 @@ struct ceph_nfs_confh {
        u32 parent_name_hash;
 } __attribute__ ((packed));
 
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
-                         int connectable)
+/*
+ * The presence of @parent_inode here tells us whether NFS wants a
+ * connectable file handle.  However, we want to make a connectable
+ * file handle unconditionally so that the MDS gets as much of a hint
+ * as possible.  That means we only use @parent_inode to indicate
+ * whether nfsd wants a connectable fh, and whether we should indicate
+ * failure from a too-small @max_len.
+ */
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+                         struct inode *parent_inode)
 {
        int type;
        struct ceph_nfs_fh *fh = (void *)rawfh;
        struct ceph_nfs_confh *cfh = (void *)rawfh;
-       struct dentry *parent;
-       struct inode *inode = dentry->d_inode;
        int connected_handle_length = sizeof(*cfh)/4;
        int handle_length = sizeof(*fh)/4;
+       struct dentry *dentry = d_find_alias(inode);
+       struct dentry *parent;
 
        /* don't re-export snaps */
        if (ceph_snap(inode) != CEPH_NOSNAP)
                return -EINVAL;
 
-       spin_lock(&dentry->d_lock);
-       parent = dentry->d_parent;
-       if (*max_len >= connected_handle_length) {
+       /* if we found an alias, generate a connectable fh */
+       if (*max_len >= connected_handle_length && dentry) {
                dout("encode_fh %p connectable\n", dentry);
-               cfh->ino = ceph_ino(dentry->d_inode);
+               spin_lock(&dentry->d_lock);
+               parent = dentry->d_parent;
+               cfh->ino = ceph_ino(inode);
                cfh->parent_ino = ceph_ino(parent->d_inode);
                cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
                                                         dentry);
                *max_len = connected_handle_length;
                type = 2;
+               spin_unlock(&dentry->d_lock);
        } else if (*max_len >= handle_length) {
-               if (connectable) {
+               if (parent_inode) {
+                       /* nfsd wants connectable */
                        *max_len = connected_handle_length;
                        type = 255;
                } else {
                        dout("encode_fh %p\n", dentry);
-                       fh->ino = ceph_ino(dentry->d_inode);
+                       fh->ino = ceph_ino(inode);
                        *max_len = handle_length;
                        type = 1;
                }
@@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
                *max_len = handle_length;
                type = 255;
        }
-       spin_unlock(&dentry->d_lock);
        return type;
 }
 
index 3adf3d4c2cd9e1a5da0da2bf9a6b984a0d276edf..6161255fac45648efdfe437d9d880d390268d14f 100644 (file)
@@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
 {
        int error;
        struct file *file;
+       int fput_needed;
        struct compat_readdir_callback buf;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.result = 0;
        buf.dirent = dirent;
@@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
        if (buf.result)
                error = buf.result;
 
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
 
@@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
        struct file * file;
        struct compat_linux_dirent __user * lastdirent;
        struct compat_getdents_callback buf;
+       int fput_needed;
        int error;
 
-       error = -EFAULT;
        if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.current_dir = dirent;
        buf.previous = NULL;
@@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
                else
                        error = count - buf.count;
        }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
 
@@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
        struct file * file;
        struct linux_dirent64 __user * lastdirent;
        struct compat_getdents_callback64 buf;
+       int fput_needed;
        int error;
 
-       error = -EFAULT;
        if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.current_dir = dirent;
        buf.previous = NULL;
@@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
                else
                        error = count - buf.count;
        }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
 #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
index 4435d8b329044da3b48c83dfe555409464797d0d..85c9e2bff8e65126eaca755e14d1ee4a15a27170 100644 (file)
@@ -683,8 +683,6 @@ EXPORT_SYMBOL(dget_parent);
 /**
  * d_find_alias - grab a hashed alias of inode
  * @inode: inode in question
- * @want_discon:  flag, used by d_splice_alias, to request
- *          that only a DISCONNECTED alias be returned.
  *
  * If inode has a hashed alias, or is a directory and has any alias,
  * acquire the reference to alias and return it. Otherwise return NULL.
@@ -693,10 +691,9 @@ EXPORT_SYMBOL(dget_parent);
  * of a filesystem.
  *
  * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that.
  */
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
 {
        struct dentry *alias, *discon_alias;
 
@@ -708,7 +705,7 @@ again:
                        if (IS_ROOT(alias) &&
                            (alias->d_flags & DCACHE_DISCONNECTED)) {
                                discon_alias = alias;
-                       } else if (!want_discon) {
+                       } else {
                                __dget_dlock(alias);
                                spin_unlock(&alias->d_lock);
                                return alias;
@@ -739,7 +736,7 @@ struct dentry *d_find_alias(struct inode *inode)
 
        if (!list_empty(&inode->i_dentry)) {
                spin_lock(&inode->i_lock);
-               de = __d_find_alias(inode, 0);
+               de = __d_find_alias(inode);
                spin_unlock(&inode->i_lock);
        }
        return de;
@@ -1650,9 +1647,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 
        if (inode && S_ISDIR(inode->i_mode)) {
                spin_lock(&inode->i_lock);
-               new = __d_find_alias(inode, 1);
+               new = __d_find_any_alias(inode);
                if (new) {
-                       BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
                        spin_unlock(&inode->i_lock);
                        security_d_instantiate(new, inode);
                        d_move(new, dentry);
@@ -2482,7 +2478,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                struct dentry *alias;
 
                /* Does an aliased dentry already exist? */
-               alias = __d_find_alias(inode, 0);
+               alias = __d_find_alias(inode);
                if (alias) {
                        actual = alias;
                        write_seqlock(&rename_lock);
@@ -2575,7 +2571,7 @@ static int prepend_path(const struct path *path,
        bool slash = false;
        int error = 0;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        while (dentry != root->dentry || vfsmnt != root->mnt) {
                struct dentry * parent;
 
@@ -2606,7 +2602,7 @@ static int prepend_path(const struct path *path,
                error = prepend(buffer, buflen, "/", 1);
 
 out:
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return error;
 
 global_root:
index ab35b113003b900ad3592217d64e6cef82fe8f9d..a07441a0a8789a9ee1e43f5be0d2b43ec3ee04e8 100644 (file)
@@ -660,11 +660,10 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
 {
        struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
        char *lower_buf;
-       size_t lower_bufsiz = PATH_MAX;
        mm_segment_t old_fs;
        int rc;
 
-       lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
+       lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!lower_buf) {
                rc = -ENOMEM;
                goto out;
@@ -673,58 +672,29 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
        set_fs(get_ds());
        rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
                                                   (char __user *)lower_buf,
-                                                  lower_bufsiz);
+                                                  PATH_MAX);
        set_fs(old_fs);
        if (rc < 0)
                goto out;
-       lower_bufsiz = rc;
        rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
-                                                 lower_buf, lower_bufsiz);
+                                                 lower_buf, rc);
 out:
        kfree(lower_buf);
        return rc;
 }
 
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
-       char *kbuf;
-       size_t kbufsiz, copied;
+       char *buf;
+       size_t len = PATH_MAX;
        int rc;
 
-       rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+       rc = ecryptfs_readlink_lower(dentry, &buf, &len);
        if (rc)
                goto out;
-       copied = min_t(size_t, bufsiz, kbufsiz);
-       rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
-       kfree(kbuf);
        fsstack_copy_attr_atime(dentry->d_inode,
                                ecryptfs_dentry_to_lower(dentry)->d_inode);
-out:
-       return rc;
-}
-
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       char *buf;
-       int len = PAGE_SIZE, rc;
-       mm_segment_t old_fs;
-
-       /* Released in ecryptfs_put_link(); only release here on error */
-       buf = kmalloc(len, GFP_KERNEL);
-       if (!buf) {
-               buf = ERR_PTR(-ENOMEM);
-               goto out;
-       }
-       old_fs = get_fs();
-       set_fs(get_ds());
-       rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
-       set_fs(old_fs);
-       if (rc < 0) {
-               kfree(buf);
-               buf = ERR_PTR(rc);
-       } else
-               buf[rc] = '\0';
+       buf[len] = '\0';
 out:
        nd_set_link(nd, buf);
        return NULL;
@@ -1153,7 +1123,7 @@ out:
 }
 
 const struct inode_operations ecryptfs_symlink_iops = {
-       .readlink = ecryptfs_readlink,
+       .readlink = generic_readlink,
        .follow_link = ecryptfs_follow_link,
        .put_link = ecryptfs_put_link,
        .permission = ecryptfs_permission,
index 52c9e2ff6e6bd8b6f763e56ceafda431731cea9b..a79786a8d2c88d5b6c580859ef12496f43b4b0f4 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -280,10 +280,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);
 
-       err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
-       if (err)
-               goto err;
-
        err = insert_vm_struct(mm, vma);
        if (err)
                goto err;
index b05acb7961355dfb680e49f3145a11065f6ac851..b0201ca6e9c6e0b7837917420bb3dfe1dc06b88f 100644 (file)
@@ -304,24 +304,23 @@ out:
 
 /**
  * export_encode_fh - default export_operations->encode_fh function
- * @dentry:  the dentry to encode
+ * @inode:   the object to encode
  * @fh:      where to store the file handle fragment
  * @max_len: maximum length to store there
- * @connectable: whether to store parent information
+ * @parent:  parent directory inode, if wanted
  *
  * This default encode_fh function assumes that the 32 inode number
  * is suitable for locating an inode, and that the generation number
  * can be used to check that it is still valid.  It places them in the
  * filehandle fragment where export_decode_fh expects to find them.
  */
-static int export_encode_fh(struct dentry *dentry, struct fid *fid,
-               int *max_len, int connectable)
+static int export_encode_fh(struct inode *inode, struct fid *fid,
+               int *max_len, struct inode *parent)
 {
-       struct inode * inode = dentry->d_inode;
        int len = *max_len;
        int type = FILEID_INO32_GEN;
 
-       if (connectable && (len < 4)) {
+       if (parent && (len < 4)) {
                *max_len = 4;
                return 255;
        } else if (len < 2) {
@@ -332,14 +331,9 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
        len = 2;
        fid->i32.ino = inode->i_ino;
        fid->i32.gen = inode->i_generation;
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                fid->i32.parent_ino = parent->i_ino;
                fid->i32.parent_gen = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
                len = 4;
                type = FILEID_INO32_GEN_PARENT;
        }
@@ -352,11 +346,22 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
 {
        const struct export_operations *nop = dentry->d_sb->s_export_op;
        int error;
+       struct dentry *p = NULL;
+       struct inode *inode = dentry->d_inode, *parent = NULL;
 
+       if (connectable && !S_ISDIR(inode->i_mode)) {
+               p = dget_parent(dentry);
+               /*
+                * note that while p might've ceased to be our parent already,
+                * it's still pinned by our reference and still positive.
+                */
+               parent = p->d_inode;
+       }
        if (nop->encode_fh)
-               error = nop->encode_fh(dentry, fid->raw, max_len, connectable);
+               error = nop->encode_fh(inode, fid->raw, max_len, parent);
        else
-               error = export_encode_fh(dentry, fid, max_len, connectable);
+               error = export_encode_fh(inode, fid, max_len, parent);
+       dput(p);
 
        return error;
 }
index 9ed1bb1f319f381b700a6386a4d8d068d04e0fdf..c22f17021b6eee7ca942a3525eb9f4fd23de6011 100644 (file)
@@ -2,6 +2,8 @@ config EXT4_FS
        tristate "The Extended 4 (ext4) filesystem"
        select JBD2
        select CRC16
+       select CRYPTO
+       select CRYPTO_CRC32C
        help
          This is the next generation of the ext3 filesystem.
 
index c45c41129a35b7346463e0f18e847b78a52e0426..99b6324290db916466d8b5c0633e9fa216d21798 100644 (file)
@@ -168,12 +168,14 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 
        /* If checksum is bad mark all blocks used to prevent allocation
         * essentially implementing a per-group read-only flag. */
-       if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+       if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_error(sb, "Checksum bad for group %u", block_group);
                ext4_free_group_clusters_set(sb, gdp, 0);
                ext4_free_inodes_set(sb, gdp, 0);
                ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
+               ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+                                          EXT4_BLOCKS_PER_GROUP(sb) / 8);
                return;
        }
        memset(bh->b_data, 0, sb->s_blocksize);
@@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
         */
        ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
                             sb->s_blocksize * 8, bh->b_data);
+       ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
 }
 
 /* Return the number of free blocks in a block group.  It is used when
@@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 }
 
 static int ext4_valid_block_bitmap(struct super_block *sb,
-                                       struct ext4_group_desc *desc,
-                                       unsigned int block_group,
-                                       struct buffer_head *bh)
+                                  struct ext4_group_desc *desc,
+                                  unsigned int block_group,
+                                  struct buffer_head *bh)
 {
        ext4_grpblk_t offset;
        ext4_grpblk_t next_zero_bit;
@@ -325,6 +330,23 @@ err_out:
                        block_group, bitmap_blk);
        return 0;
 }
+
+/*
+ * Run the block bitmap checks on @bh once and remember a successful result.
+ *
+ * A buffer already flagged as verified is left alone.  Otherwise, with the
+ * group lock held, the bitmap is passed to ext4_valid_block_bitmap() and
+ * then to ext4_block_bitmap_csum_verify() (over EXT4_BLOCKS_PER_GROUP / 8
+ * bytes); only if both succeed is the buffer marked verified.  The checksum
+ * check is not attempted when the first check fails.
+ */
+void ext4_validate_block_bitmap(struct super_block *sb,
+                              struct ext4_group_desc *desc,
+                              unsigned int block_group,
+                              struct buffer_head *bh)
+{
+       if (buffer_verified(bh))
+               return;
+
+       ext4_lock_group(sb, block_group);
+       if (!ext4_valid_block_bitmap(sb, desc, block_group, bh))
+               goto unlock;
+       if (!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
+                                          EXT4_BLOCKS_PER_GROUP(sb) / 8))
+               goto unlock;
+       set_buffer_verified(bh);
+unlock:
+       ext4_unlock_group(sb, block_group);
+}
+
 /**
  * ext4_read_block_bitmap()
  * @sb:                        super block
@@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        }
 
        if (bitmap_uptodate(bh))
-               return bh;
+               goto verify;
 
        lock_buffer(bh);
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
-               return bh;
+               goto verify;
        }
        ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
-               return bh;
+               goto verify;
        }
        /*
         * submit the buffer_head for reading
@@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
        get_bh(bh);
        submit_bh(READ, bh);
        return bh;
+verify:
+       ext4_validate_block_bitmap(sb, desc, block_group, bh);
+       return bh;
 }
 
 /* Returns 0 on success, 1 on error */
@@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
        }
        clear_buffer_new(bh);
        /* Panic or remount fs read-only if block bitmap is invalid */
-       ext4_valid_block_bitmap(sb, desc, block_group, bh);
+       ext4_validate_block_bitmap(sb, desc, block_group, bh);
        return 0;
 }
 
index fa3af81ac565c16dba6237edc89c3c7d70c5fc61..b319721da26ae32010adcd46db7e2d98ec50887a 100644 (file)
@@ -29,3 +29,86 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
 
 #endif  /*  EXT4FS_DEBUG  */
 
+/*
+ * Compare the inode bitmap checksum stored in @gdp with one computed over
+ * the first @sz bytes of @bh->b_data, seeded with sbi->s_csum_seed.
+ *
+ * Returns 1 when the METADATA_CSUM feature is not set or the checksums
+ * match, 0 on mismatch.  When the descriptor is too small to hold the high
+ * 16 bits, only the low 16 bits are compared.  @group is not used here.
+ */
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       __u32 stored, computed;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       stored = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+
+       /* Short descriptor: no room for the high half on disk. */
+       if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+               return stored == (computed & 0xFFFF);
+
+       stored |= (__u32)le16_to_cpu(gdp->bg_inode_bitmap_csum_hi) << 16;
+       return stored == computed;
+}
+
+/*
+ * Compute the checksum of the first @sz bytes of @bh->b_data (seeded with
+ * sbi->s_csum_seed) and store it in the inode bitmap checksum fields of
+ * @gdp.  Does nothing unless the METADATA_CSUM feature is set.  The high
+ * 16 bits are stored only when the descriptor is large enough to contain
+ * them.  @group is not used here.
+ */
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       __u32 crc;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(crc & 0xFFFF);
+
+       if (sbi->s_desc_size < EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+               return;
+       gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(crc >> 16);
+}
+
+/*
+ * Compare the block bitmap checksum stored in @gdp with one computed over
+ * the first @sz bytes of @bh->b_data, seeded with sbi->s_csum_seed.
+ *
+ * Returns 1 when the METADATA_CSUM feature is not set or the checksums
+ * match.  On mismatch an ext4_error() naming @group is raised and 0 is
+ * returned.  When the descriptor is too small to hold the high 16 bits,
+ * only the low 16 bits are compared.
+ */
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       __u32 stored, computed;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       stored = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+       computed = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+
+       if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+               /* Short descriptor: only the low half exists on disk. */
+               computed &= 0xFFFF;
+       } else {
+               __u32 upper = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+
+               stored |= upper << 16;
+       }
+
+       if (stored != computed) {
+               ext4_error(sb, "Bad block bitmap checksum: block_group = %u",
+                          group);
+               return 0;
+       }
+       return 1;
+}
+
+/*
+ * Compute the checksum of the first @sz bytes of @bh->b_data (seeded with
+ * sbi->s_csum_seed) and store it in the block bitmap checksum fields of
+ * @gdp.  Does nothing unless the METADATA_CSUM feature is set.  The high
+ * 16 bits are stored only when the descriptor is large enough to contain
+ * them.  @group is not used here.
+ */
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       __u32 crc;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       crc = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       gdp->bg_block_bitmap_csum_lo = cpu_to_le16(crc & 0xFFFF);
+
+       if (sbi->s_desc_size < EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+               return;
+       gdp->bg_block_bitmap_csum_hi = cpu_to_le16(crc >> 16);
+}
index b86786202643bdd8044ee85fb72a0a21bc2c9bef..aa39e600d15954244aead38f7aed30513ce86d65 100644 (file)
@@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp,
                        continue;
                }
 
+               /* Check the checksum */
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(inode,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+                                       "at offset %llu",
+                                       (unsigned long long)filp->f_pos);
+                       filp->f_pos += sb->s_blocksize - offset;
+                       continue;
+               }
+               set_buffer_verified(bh);
+
 revalidate:
                /* If the dir block has changed since the last call to
                 * readdir(2), then we might be pointing to an invalid
index c21b1de51afbb42191adea4fc4a357e3906c8489..cfc4e01b3c8370c642681824ef55b13a66683c0d 100644 (file)
@@ -29,6 +29,7 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#include <crypto/hash.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
@@ -298,7 +299,9 @@ struct ext4_group_desc
        __le16  bg_free_inodes_count_lo;/* Free inodes count */
        __le16  bg_used_dirs_count_lo;  /* Directories count */
        __le16  bg_flags;               /* EXT4_BG_flags (INODE_UNINIT, etc) */
-       __u32   bg_reserved[2];         /* Likely block/inode bitmap checksum */
+       __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
+       __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bbitmap) LE */
+       __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+ibitmap) LE */
        __le16  bg_itable_unused_lo;    /* Unused inodes count */
        __le16  bg_checksum;            /* crc16(sb_uuid+group+desc) */
        __le32  bg_block_bitmap_hi;     /* Blocks bitmap block MSB */
@@ -308,9 +311,19 @@ struct ext4_group_desc
        __le16  bg_free_inodes_count_hi;/* Free inodes count MSB */
        __le16  bg_used_dirs_count_hi;  /* Directories count MSB */
        __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
-       __u32   bg_reserved2[3];
+       __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
+       __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bbitmap) BE */
+       __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+ibitmap) BE */
+       __u32   bg_reserved;
 };
 
+/*
+ * Byte offset, within struct ext4_group_desc, of the first byte past the
+ * high 16 bits of the inode / block bitmap checksum.  The bitmap checksum
+ * helpers compare sbi->s_desc_size against these values to decide whether
+ * the corresponding *_csum_hi field is present in the descriptor.
+ */
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END       \
+       (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+        sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END       \
+       (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+        sizeof(__le16))
+
 /*
  * Structure of a flex block group info
  */
@@ -650,7 +663,8 @@ struct ext4_inode {
                        __le16  l_i_file_acl_high;
                        __le16  l_i_uid_high;   /* these 2 fields */
                        __le16  l_i_gid_high;   /* were reserved2[0] */
-                       __u32   l_i_reserved2;
+                       __le16  l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+                       __le16  l_i_reserved;
                } linux2;
                struct {
                        __le16  h_i_reserved1;  /* Obsoleted fragment number/size which are removed in ext4 */
@@ -666,7 +680,7 @@ struct ext4_inode {
                } masix2;
        } osd2;                         /* OS dependent 2 */
        __le16  i_extra_isize;
-       __le16  i_pad1;
+       __le16  i_checksum_hi;  /* crc32c(uuid+inum+inode) BE */
        __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
        __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
        __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
@@ -768,7 +782,7 @@ do {                                                                               \
 #define i_gid_low      i_gid
 #define i_uid_high     osd2.linux2.l_i_uid_high
 #define i_gid_high     osd2.linux2.l_i_gid_high
-#define i_reserved2    osd2.linux2.l_i_reserved2
+#define i_checksum_lo  osd2.linux2.l_i_checksum_lo
 
 #elif defined(__GNU__)
 
@@ -908,6 +922,9 @@ struct ext4_inode_info {
         */
        tid_t i_sync_tid;
        tid_t i_datasync_tid;
+
+       /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+       __u32 i_csum_seed;
 };
 
 /*
@@ -1001,6 +1018,9 @@ extern void ext4_set_bits(void *bm, int cur, int len);
 #define EXT4_ERRORS_PANIC              3       /* Panic */
 #define EXT4_ERRORS_DEFAULT            EXT4_ERRORS_CONTINUE
 
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM             1
+
 /*
  * Structure of the super block
  */
@@ -1087,7 +1107,7 @@ struct ext4_super_block {
        __le64  s_mmp_block;            /* Block for multi-mount protection */
        __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
-       __u8    s_reserved_char_pad;
+       __u8    s_checksum_type;        /* metadata checksum algorithm used */
        __le16  s_reserved_pad;
        __le64  s_kbytes_written;       /* nr of lifetime kilobytes written */
        __le32  s_snapshot_inum;        /* Inode number of active snapshot */
@@ -1113,7 +1133,8 @@ struct ext4_super_block {
        __le32  s_usr_quota_inum;       /* inode for tracking user quota */
        __le32  s_grp_quota_inum;       /* inode for tracking group quota */
        __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
-       __le32  s_reserved[109];        /* Padding to the end of the block */
+       __le32  s_reserved[108];        /* Padding to the end of the block */
+       __le32  s_checksum;             /* crc32c(superblock) */
 };
 
 #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1176,6 +1197,7 @@ struct ext4_sb_info {
        struct proc_dir_entry *s_proc;
        struct kobject s_kobj;
        struct completion s_kobj_unregister;
+       struct super_block *s_sb;
 
        /* Journaling */
        struct journal_s *s_journal;
@@ -1266,6 +1288,12 @@ struct ext4_sb_info {
 
        /* record the last minlen when FITRIM is called. */
        atomic_t s_last_trim_minblks;
+
+       /* Reference to checksum algorithm driver via cryptoapi */
+       struct crypto_shash *s_chksum_driver;
+
+       /* Precomputed FS UUID checksum for seeding other checksums */
+       __u32 s_csum_seed;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1414,6 +1442,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
 #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE     0x0040
 #define EXT4_FEATURE_RO_COMPAT_QUOTA           0x0100
 #define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums.  However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
 #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM   0x0400
 
 #define EXT4_FEATURE_INCOMPAT_COMPRESSION      0x0001
@@ -1461,7 +1495,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
                                         EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                         EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                         EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
-                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC)
+                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+                                        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
 
 /*
  * Default values for user and/or group using reserved blocks
@@ -1526,6 +1561,18 @@ struct ext4_dir_entry_2 {
        char    name[EXT4_NAME_LEN];    /* File name */
 };
 
+/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ *
+ * The fields add up to 12 bytes (4 + 2 + 1 + 1 + 4), matching the value
+ * documented for det_rec_len.  All multi-byte fields are little-endian.
+ * NOTE(review): det_reserved_ft presumably holds EXT4_FT_DIR_CSUM (0xDE);
+ * confirm against the code that writes the tail.
+ */
+struct ext4_dir_entry_tail {
+       __le32  det_reserved_zero1;     /* Pretend to be unused */
+       __le16  det_rec_len;            /* 12 */
+       __u8    det_reserved_zero2;     /* Zero name length */
+       __u8    det_reserved_ft;        /* 0xDE, fake file type */
+       __le32  det_checksum;           /* crc32c(uuid+inum+dirblock) */
+};
+
 /*
  * Ext4 directory file types.  Only the low 3 bits are used.  The
  * other bits are reserved for now.
@@ -1541,6 +1588,8 @@ struct ext4_dir_entry_2 {
 
 #define EXT4_FT_MAX            8
 
+#define EXT4_FT_DIR_CSUM       0xDE
+
 /*
  * EXT4_DIR_PAD defines the directory entries boundaries
  *
@@ -1609,6 +1658,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 #define DX_HASH_HALF_MD4_UNSIGNED      4
 #define DX_HASH_TEA_UNSIGNED           5
 
+/*
+ * Feed @length bytes at @address into the filesystem's checksum driver,
+ * continuing from the running value @crc, and return the updated value.
+ *
+ * The shash descriptor lives on the stack.  Its context is seeded by storing
+ * @crc straight into it and the result is read back the same way, so the
+ * driver's per-request context must be exactly one u32.  That requirement is
+ * asserted up front instead of sizing the stack buffer from a runtime value,
+ * which would put a variable-length array inside the struct and still be
+ * wrong for any other context size.
+ *
+ * A failing crypto_shash_update() is treated as a fatal bug.
+ */
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+                             const void *address, unsigned int length)
+{
+       struct {
+               struct shash_desc shash;
+               char ctx[sizeof(u32)];
+       } desc;
+       int err;
+
+       BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx));
+
+       desc.shash.tfm = sbi->s_chksum_driver;
+       desc.shash.flags = 0;
+       *(u32 *)desc.ctx = crc;
+
+       err = crypto_shash_update(&desc.shash, address, length);
+       BUG_ON(err);
+
+       return *(u32 *)desc.ctx;
+}
+
 #ifdef __KERNEL__
 
 /* hash info structure used by the directory hash */
@@ -1741,7 +1809,8 @@ struct mmp_struct {
        __le16  mmp_check_interval;
 
        __le16  mmp_pad1;
-       __le32  mmp_pad2[227];
+       __le32  mmp_pad2[226];
+       __le32  mmp_checksum;           /* crc32c(uuid+mmp_block) */
 };
 
 /* arguments passed to the mmp thread */
@@ -1784,8 +1853,24 @@ struct mmpd_data {
 
 /* bitmap.c */
 extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz);
 
 /* balloc.c */
+extern void ext4_validate_block_bitmap(struct super_block *sb,
+                                      struct ext4_group_desc *desc,
+                                      unsigned int block_group,
+                                      struct buffer_head *bh);
 extern unsigned int ext4_block_group(struct super_block *sb,
                        ext4_fsblk_t blocknr);
 extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1864,7 +1949,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 /* mballoc.c */
 extern long ext4_mb_stats;
 extern long ext4_mb_max_to_scan;
-extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_init(struct super_block *);
 extern int ext4_mb_release(struct super_block *);
 extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                struct ext4_allocation_request *, int *);
@@ -1936,6 +2021,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern int ext4_ext_migrate(struct inode *);
 
 /* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+                                  struct ext4_dir_entry *dirent);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1950,6 +2037,10 @@ extern int ext4_group_extend(struct super_block *sb,
 extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
 
 /* super.c */
+extern int ext4_superblock_csum_verify(struct super_block *sb,
+                                      struct ext4_super_block *es);
+extern void ext4_superblock_csum_set(struct super_block *sb,
+                                    struct ext4_super_block *es);
 extern void *ext4_kvmalloc(size_t size, gfp_t flags);
 extern void *ext4_kvzalloc(size_t size, gfp_t flags);
 extern void ext4_kvfree(void *ptr);
@@ -2025,10 +2116,17 @@ extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
 extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-                                  struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                       struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+                                    struct ext4_group_desc *gdp);
+
+/*
+ * Report whether @sb has either of the ro-compat feature bits that turn on
+ * group descriptor checksums (GDT_CSUM or METADATA_CSUM), as tested by
+ * EXT4_HAS_RO_COMPAT_FEATURE().
+ */
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+       const __u32 csum_features = EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+                                   EXT4_FEATURE_RO_COMPAT_METADATA_CSUM;
+
+       return EXT4_HAS_RO_COMPAT_FEATURE(sb, csum_features);
+}
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -2225,6 +2323,9 @@ static inline void ext4_unlock_group(struct super_block *sb,
 
 static inline void ext4_mark_super_dirty(struct super_block *sb)
 {
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+       ext4_superblock_csum_set(sb, es);
        if (EXT4_SB(sb)->s_journal == NULL)
                sb->s_dirt =1;
 }
@@ -2314,6 +2415,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
+extern int ext4_mmp_csum_verify(struct super_block *sb,
+                               struct mmp_struct *mmp);
 
 /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
 enum ext4_state_bits {
index 0f58b86e3a0206e19626f361f453aa80b2838857..cb1b2c919963290fd10d09ba12f6d8c53ace9fa6 100644 (file)
  * ext4_inode has i_block array (60 bytes total).
  * The first 12 bytes store ext4_extent_header;
  * the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
  */
 
+/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long.  It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes.  Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ *
+ * The tail sits EXT4_EXTENT_TAIL_OFFSET(hdr) bytes after the start of the
+ * block's extent header; et_checksum is stored little-endian.
+ */
+struct ext4_extent_tail {
+       __le32  et_checksum;    /* crc32c(uuid+inum+extent_block) */
+};
+
 /*
  * This is the extent on-disk structure.
  * It's used at the bottom of the tree.
@@ -101,6 +114,17 @@ struct ext4_extent_header {
 
 #define EXT4_EXT_MAGIC         cpu_to_le16(0xf30a)
 
+/*
+ * Byte offset from an extent header to the position just past its eh_max
+ * extent slots, i.e. where the checksum tail is placed.  @hdr is evaluated
+ * once and its eh_max field is converted from little-endian.
+ */
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+       (sizeof(struct ext4_extent_header) + \
+        (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+/*
+ * Return a pointer to the checksum tail that follows the extent slots of
+ * the block whose header is @eh (EXT4_EXTENT_TAIL_OFFSET(eh) bytes in).
+ */
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+       char *base = (char *)eh;
+
+       return (struct ext4_extent_tail *)(base + EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
 /*
  * Array of ext4_ext_path contains path to some extent.
  * Creation/lookup routines use it for traversal/splitting/etc.
index aca17901758249d4329d780714e328ef42851e35..90f7c2e84db1bef3fdb90931f9124cfe052705dd 100644 (file)
@@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
 }
 
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
-                             handle_t *handle, struct super_block *sb)
+                             handle_t *handle, struct super_block *sb,
+                             int now)
 {
        struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
        int err = 0;
 
        if (ext4_handle_valid(handle)) {
+               ext4_superblock_csum_set(sb,
+                               (struct ext4_super_block *)bh->b_data);
                err = jbd2_journal_dirty_metadata(handle, bh);
                if (err)
                        ext4_journal_abort_handle(where, line, __func__,
                                                  bh, handle, err);
+       } else if (now) {
+               ext4_superblock_csum_set(sb,
+                               (struct ext4_super_block *)bh->b_data);
+               mark_buffer_dirty(bh);
        } else
                sb->s_dirt = 1;
        return err;
index 83b20fcf9400b11b28185470f8feef309faa8252..f440e8f1841f4e2521486bd94ae19ed83aa896ab 100644 (file)
@@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                 struct buffer_head *bh);
 
 int __ext4_handle_dirty_super(const char *where, unsigned int line,
-                             handle_t *handle, struct super_block *sb);
+                             handle_t *handle, struct super_block *sb,
+                             int now);
 
 #define ext4_journal_get_write_access(handle, bh) \
        __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
@@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 #define ext4_handle_dirty_metadata(handle, inode, bh) \
        __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                     (bh))
+#define ext4_handle_dirty_super_now(handle, sb) \
+       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1)
 #define ext4_handle_dirty_super(handle, sb) \
-       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0)
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
index abcdeab67f5232b66d4aa5a6cbb88838094f6247..91341ec6e06a94f2f400d10039a64585cd17ed2e 100644 (file)
 #define EXT4_EXT_MARK_UNINIT1  0x2  /* mark first half uninitialized */
 #define EXT4_EXT_MARK_UNINIT2  0x4  /* mark second half uninitialized */
 
+/*
+ * Compute the checksum of an extent block: everything from the header @eh
+ * up to (not including) the tail, seeded with the inode's i_csum_seed.
+ * The result is returned in little-endian form, ready to be compared with
+ * or stored into the tail's et_checksum.
+ */
+static __le32 ext4_extent_block_csum(struct inode *inode,
+                                    struct ext4_extent_header *eh)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       __u32 seed = EXT4_I(inode)->i_csum_seed;
+       __u32 crc;
+
+       crc = ext4_chksum(sbi, seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh));
+       return cpu_to_le32(crc);
+}
+
+/*
+ * Check the checksum stored in the tail of the extent block headed by @eh.
+ * Returns 1 when the METADATA_CSUM feature is not set or the stored value
+ * equals the freshly computed one, 0 otherwise.
+ */
+static int ext4_extent_block_csum_verify(struct inode *inode,
+                                        struct ext4_extent_header *eh)
+{
+       __le32 expected;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       expected = ext4_extent_block_csum(inode, eh);
+       return find_ext4_extent_tail(eh)->et_checksum == expected;
+}
+
+/*
+ * Recompute the checksum of the extent block headed by @eh and store it in
+ * the block's tail.  Does nothing unless the METADATA_CSUM feature is set.
+ */
+static void ext4_extent_block_csum_set(struct inode *inode,
+                                      struct ext4_extent_header *eh)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               find_ext4_extent_tail(eh)->et_checksum =
+                       ext4_extent_block_csum(inode, eh);
+}
+
 static int ext4_split_extent(handle_t *handle,
                                struct inode *inode,
                                struct ext4_ext_path *path,
@@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
 {
        int err;
        if (path->p_bh) {
+               ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
                /* path points to block */
                err = __ext4_handle_dirty_metadata(where, line, handle,
                                                   inode, path->p_bh);
@@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line,
                error_msg = "invalid extent entries";
                goto corrupted;
        }
+       /* Verify checksum on non-root extent tree nodes */
+       if (ext_depth(inode) != depth &&
+           !ext4_extent_block_csum_verify(inode, eh)) {
+               error_msg = "extent tree corrupted";
+               goto corrupted;
+       }
        return 0;
 
 corrupted:
@@ -412,6 +459,26 @@ int ext4_ext_check_inode(struct inode *inode)
        return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
 }
 
+/*
+ * Validate the extent block in @bh once and cache a successful result.
+ *
+ * @function, @line: call site to attribute any reported problem to
+ * @inode:           inode owning the extent tree
+ * @eh:              extent header inside @bh
+ * @depth:           depth expected for this block
+ * @bh:              buffer holding the block
+ *
+ * A buffer already flagged as verified is accepted without rechecking.
+ * Otherwise the block is checked and, only on success, flagged as verified.
+ * Returns 0 on success or the error from the check.
+ *
+ * The caller's @function/@line are forwarded to __ext4_ext_check() directly;
+ * going through the ext4_ext_check() wrapper would substitute this helper's
+ * own name and line and leave both parameters unused.
+ */
+static int __ext4_ext_check_block(const char *function, unsigned int line,
+                                 struct inode *inode,
+                                 struct ext4_extent_header *eh,
+                                 int depth,
+                                 struct buffer_head *bh)
+{
+       int ret;
+
+       if (buffer_verified(bh))
+               return 0;
+       ret = __ext4_ext_check(function, line, inode, eh, depth);
+       if (ret)
+               return ret;
+       set_buffer_verified(bh);
+       return ret;
+}
+
+#define ext4_ext_check_block(inode, eh, depth, bh)     \
+       __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
 {
@@ -536,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
        }
 
        path->p_idx = l - 1;
-       ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+       ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                  ext4_idx_pblock(path->p_idx));
 
 #ifdef CHECK_BINSEARCH
@@ -668,8 +735,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        i = depth;
        /* walk through the tree */
        while (i) {
-               int need_to_validate = 0;
-
                ext_debug("depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
 
@@ -688,8 +753,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                                put_bh(bh);
                                goto err;
                        }
-                       /* validate the extent entries */
-                       need_to_validate = 1;
                }
                eh = ext_block_hdr(bh);
                ppos++;
@@ -703,7 +766,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_hdr = eh;
                i--;
 
-               if (need_to_validate && ext4_ext_check(inode, eh, i))
+               if (ext4_ext_check_block(inode, eh, i, bh))
                        goto err;
        }
 
@@ -914,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                le16_add_cpu(&neh->eh_entries, m);
        }
 
+       ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
 
@@ -992,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                                sizeof(struct ext4_extent_idx) * m);
                        le16_add_cpu(&neh->eh_entries, m);
                }
+               ext4_extent_block_csum_set(inode, neh);
                set_buffer_uptodate(bh);
                unlock_buffer(bh);
 
@@ -1089,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
        else
                neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
        neh->eh_magic = EXT4_EXT_MAGIC;
+       ext4_extent_block_csum_set(inode, neh);
        set_buffer_uptodate(bh);
        unlock_buffer(bh);
 
@@ -1344,7 +1410,8 @@ got_index:
                        return -EIO;
                eh = ext_block_hdr(bh);
                /* subtract from p_depth to get proper eh_depth */
-               if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+               if (ext4_ext_check_block(inode, eh,
+                                        path->p_depth - depth, bh)) {
                        put_bh(bh);
                        return -EIO;
                }
@@ -1357,7 +1424,7 @@ got_index:
        if (bh == NULL)
                return -EIO;
        eh = ext_block_hdr(bh);
-       if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+       if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
                put_bh(bh);
                return -EIO;
        }
@@ -2644,8 +2711,8 @@ cont:
                                err = -EIO;
                                break;
                        }
-                       if (ext4_ext_check(inode, ext_block_hdr(bh),
-                                                       depth - i - 1)) {
+                       if (ext4_ext_check_block(inode, ext_block_hdr(bh),
+                                                       depth - i - 1, bh)) {
                                err = -EIO;
                                break;
                        }
@@ -4722,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
 
        /* Now release the pages */
        if (last_page_offset > first_page_offset) {
-               truncate_inode_pages_range(mapping, first_page_offset,
-                                          last_page_offset-1);
+               truncate_pagecache_range(inode, first_page_offset,
+                                        last_page_offset - 1);
        }
 
        /* finish any pending end_io work */
index cb70f1812a70f5ca8452e98776cd309ad6638055..8c7642a00054fd1ddf649e733e4b6efb5a0eb14b 100644 (file)
@@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
 {
        struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
        int unaligned_aio = 0;
-       int ret;
+       ssize_t ret;
 
        /*
         * If we have encountered a bitmap-format file, the size limit
index 9f9acac6c43f4ac8006e363b5567b4a88dd56615..d48e8b14928cf993c50c33fe9b18a90203c2c492 100644 (file)
@@ -70,24 +70,27 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
                                       ext4_group_t block_group,
                                       struct ext4_group_desc *gdp)
 {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-
        J_ASSERT_BH(bh, buffer_locked(bh));
 
        /* If checksum is bad mark all blocks and inodes use to prevent
         * allocation, essentially implementing a per-group read-only flag. */
-       if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+       if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                ext4_error(sb, "Checksum bad for group %u", block_group);
                ext4_free_group_clusters_set(sb, gdp, 0);
                ext4_free_inodes_set(sb, gdp, 0);
                ext4_itable_unused_set(sb, gdp, 0);
                memset(bh->b_data, 0xff, sb->s_blocksize);
+               ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8);
                return 0;
        }
 
        memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
        ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                        bh->b_data);
+       ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
 
        return EXT4_INODES_PER_GROUP(sb);
 }
@@ -128,12 +131,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                return NULL;
        }
        if (bitmap_uptodate(bh))
-               return bh;
+               goto verify;
 
        lock_buffer(bh);
        if (bitmap_uptodate(bh)) {
                unlock_buffer(bh);
-               return bh;
+               goto verify;
        }
 
        ext4_lock_group(sb, block_group);
@@ -141,6 +144,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
+               set_buffer_verified(bh);
                ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
@@ -154,7 +158,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 */
                set_bitmap_uptodate(bh);
                unlock_buffer(bh);
-               return bh;
+               goto verify;
        }
        /*
         * submit the buffer_head for reading
@@ -171,6 +175,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                           block_group, bitmap_blk);
                return NULL;
        }
+
+verify:
+       ext4_lock_group(sb, block_group);
+       if (!buffer_verified(bh) &&
+           !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8)) {
+               ext4_unlock_group(sb, block_group);
+               put_bh(bh);
+               ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+                          "inode_bitmap = %llu", block_group, bitmap_blk);
+               return NULL;
+       }
+       ext4_unlock_group(sb, block_group);
+       set_buffer_verified(bh);
        return bh;
 }
 
@@ -276,7 +294,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                ext4_used_dirs_set(sb, gdp, count);
                percpu_counter_dec(&sbi->s_dirs_counter);
        }
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+       ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);
 
        percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -488,10 +508,12 @@ fallback_retry:
        for (i = 0; i < ngroups; i++) {
                grp = (parent_group + i) % ngroups;
                desc = ext4_get_group_desc(sb, grp, NULL);
-               grp_free = ext4_free_inodes_count(sb, desc);
-               if (desc && grp_free && grp_free >= avefreei) {
-                       *group = grp;
-                       return 0;
+               if (desc) {
+                       grp_free = ext4_free_inodes_count(sb, desc);
+                       if (grp_free && grp_free >= avefreei) {
+                               *group = grp;
+                               return 0;
+                       }
                }
        }
 
@@ -709,7 +731,7 @@ repeat_in_this_group:
 
 got:
        /* We may have to initialize the block bitmap if it isn't already */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+       if (ext4_has_group_desc_csum(sb) &&
            gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                struct buffer_head *block_bitmap_bh;
 
@@ -731,8 +753,11 @@ got:
                        gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                        ext4_free_group_clusters_set(sb, gdp,
                                ext4_free_clusters_after_init(sb, group, gdp));
-                       gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
-                                                               gdp);
+                       ext4_block_bitmap_csum_set(sb, group, gdp,
+                                                  block_bitmap_bh,
+                                                  EXT4_BLOCKS_PER_GROUP(sb) /
+                                                  8);
+                       ext4_group_desc_csum_set(sb, group, gdp);
                }
                ext4_unlock_group(sb, group);
 
@@ -751,7 +776,7 @@ got:
                goto fail;
 
        /* Update the relevant bg descriptor fields */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+       if (ext4_has_group_desc_csum(sb)) {
                int free;
                struct ext4_group_info *grp = ext4_get_group_info(sb, group);
 
@@ -772,7 +797,10 @@ got:
                        ext4_itable_unused_set(sb, gdp,
                                        (EXT4_INODES_PER_GROUP(sb) - ino));
                up_read(&grp->alloc_sem);
+       } else {
+               ext4_lock_group(sb, group);
        }
+
        ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
        if (S_ISDIR(mode)) {
                ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
@@ -782,10 +810,12 @@ got:
                        atomic_inc(&sbi->s_flex_groups[f].used_dirs);
                }
        }
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-               ext4_unlock_group(sb, group);
+       if (ext4_has_group_desc_csum(sb)) {
+               ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8);
+               ext4_group_desc_csum_set(sb, group, gdp);
        }
+       ext4_unlock_group(sb, group);
 
        BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
        err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
@@ -850,6 +880,19 @@ got:
        inode->i_generation = sbi->s_next_generation++;
        spin_unlock(&sbi->s_next_gen_lock);
 
+       /* Precompute checksum seed for inode metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               __u32 csum;
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+               __le32 inum = cpu_to_le32(inode->i_ino);
+               __le32 gen = cpu_to_le32(inode->i_generation);
+               csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+                                  sizeof(inum));
+               ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+                                             sizeof(gen));
+       }
+
        ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
        ext4_set_inode_state(inode, EXT4_STATE_NEW);
 
@@ -1140,7 +1183,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 skip_zeroout:
        ext4_lock_group(sb, group);
        gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+       ext4_group_desc_csum_set(sb, group, gdp);
        ext4_unlock_group(sb, group);
 
        BUFFER_TRACE(group_desc_bh,
index 07eaf565fdcb2ad4fba4f92c6fe55a01b2fea17b..02bc8cbe7281b3d47c3449a1c4b8e4220685ba52 100644 (file)
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+                             struct ext4_inode_info *ei)
+{      /* Checksum EXT4_INODE_SIZE bytes of *raw with its checksum fields zeroed. */
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       __le16 csum_lo;         /* saved low half, kept in on-disk byte order */
+       __le16 csum_hi = 0;     /* saved high half, used only if the field fits */
+       __u32 csum;
+
+       csum_lo = raw->i_checksum_lo;
+       raw->i_checksum_lo = 0; /* the stored checksum must not feed into itself */
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+               csum_hi = raw->i_checksum_hi;
+               raw->i_checksum_hi = 0;
+       }
+
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+                          EXT4_INODE_SIZE(inode->i_sb));
+
+       raw->i_checksum_lo = csum_lo;   /* put the saved values back unchanged */
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               raw->i_checksum_hi = csum_hi;
+
+       return csum;    /* callers split and byte-swap this when storing it */
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+                                 struct ext4_inode_info *ei)
+{      /* Returns nonzero if the stored checksum matches or no check applies. */
+       __u32 provided, calculated;
+
+       if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+           cpu_to_le32(EXT4_OS_LINUX) ||
+           !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;       /* creator OS is not Linux, or metadata_csum is off */
+
+       provided = le16_to_cpu(raw->i_checksum_lo);
+       calculated = ext4_inode_csum(inode, raw, ei);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+       else
+               calculated &= 0xFFFF;   /* no high half stored: compare low 16 bits only */
+
+       return provided == calculated;
+}
+
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+                               struct ext4_inode_info *ei)
+{      /* Recompute the inode checksum and store it into *raw. */
+       __u32 csum;
+
+       if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+           cpu_to_le32(EXT4_OS_LINUX) ||
+           !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;         /* same skip conditions as ext4_inode_csum_verify() */
+
+       csum = ext4_inode_csum(inode, raw, ei);
+       raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               raw->i_checksum_hi = cpu_to_le16(csum >> 16);   /* high half only when the field fits */
+}
+
 static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                              loff_t new_size)
 {
@@ -3517,8 +3584,7 @@ make_io:
                                b = table;
                        end = b + EXT4_SB(sb)->s_inode_readahead_blks;
                        num = EXT4_INODES_PER_GROUP(sb);
-                       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                      EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+                       if (ext4_has_group_desc_csum(sb))
                                num -= ext4_itable_unused_count(sb, gdp);
                        table += num / inodes_per_block;
                        if (end > table)
@@ -3646,6 +3712,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        if (ret < 0)
                goto bad_inode;
        raw_inode = ext4_raw_inode(&iloc);
+
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+               ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+               if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+                   EXT4_INODE_SIZE(inode->i_sb)) {
+                       EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+                               EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+                               EXT4_INODE_SIZE(inode->i_sb));
+                       ret = -EIO;
+                       goto bad_inode;
+               }
+       } else
+               ei->i_extra_isize = 0;
+
+       /* Precompute checksum seed for inode metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+               __u32 csum;
+               __le32 inum = cpu_to_le32(inode->i_ino);
+               __le32 gen = raw_inode->i_generation;
+               csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+                                  sizeof(inum));
+               ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+                                             sizeof(gen));
+       }
+
+       if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+               EXT4_ERROR_INODE(inode, "checksum invalid");
+               ret = -EIO;
+               goto bad_inode;
+       }
+
        inode->i_mode = le16_to_cpu(raw_inode->i_mode);
        i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
        i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -3725,12 +3824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        }
 
        if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-               ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
-               if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
-                   EXT4_INODE_SIZE(inode->i_sb)) {
-                       ret = -EIO;
-                       goto bad_inode;
-               }
                if (ei->i_extra_isize == 0) {
                        /* The extra space is currently unused. Use it. */
                        ei->i_extra_isize = sizeof(struct ext4_inode) -
@@ -3742,8 +3835,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
                                ext4_set_inode_state(inode, EXT4_STATE_XATTR);
                }
-       } else
-               ei->i_extra_isize = 0;
+       }
 
        EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
        EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
@@ -3942,7 +4034,7 @@ static int ext4_do_update_inode(handle_t *handle,
                        EXT4_SET_RO_COMPAT_FEATURE(sb,
                                        EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                        ext4_handle_sync(handle);
-                       err = ext4_handle_dirty_super(handle, sb);
+                       err = ext4_handle_dirty_super_now(handle, sb);
                }
        }
        raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -3969,6 +4061,8 @@ static int ext4_do_update_inode(handle_t *handle,
                raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
        }
 
+       ext4_inode_csum_set(inode, raw_inode, ei);
+
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
        rc = ext4_handle_dirty_metadata(handle, NULL, bh);
        if (!err)
@@ -4213,7 +4307,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
         * will return the blocks that include the delayed allocation
         * blocks for this file.
         */
-       delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+       delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+                               EXT4_I(inode)->i_reserved_data_blocks);
 
        stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
        return 0;
index 6eee25591b8159bc96d35a16f94f94c0855a35b9..8ad112ae0ade2f21a953ccc03b687939b0b81310 100644 (file)
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                handle_t *handle = NULL;
                int err, migrate = 0;
                struct ext4_iloc iloc;
-               unsigned int oldflags;
+               unsigned int oldflags, mask, i;
                unsigned int jflag;
 
                if (!inode_owner_or_capable(inode))
@@ -115,8 +115,13 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (err)
                        goto flags_err;
 
-               flags = flags & EXT4_FL_USER_MODIFIABLE;
-               flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
+               for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+                       if (!(mask & EXT4_FL_USER_MODIFIABLE))
+                               continue;       /* not user-modifiable: leave this bit alone */
+                       if (mask & flags)
+                               ext4_set_inode_flag(inode, i);
+                       else
+                               ext4_clear_inode_flag(inode, i);
+               }
-               ei->i_flags = flags;
 
                ext4_set_inode_flags(inode);
@@ -152,6 +157,13 @@ flags_out:
                if (!inode_owner_or_capable(inode))
                        return -EPERM;
 
+               if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+                       ext4_warning(sb, "Setting inode version is not "
+                                    "supported with metadata_csum enabled.");
+                       return -ENOTTY;
+               }
+
                err = mnt_want_write_file(filp);
                if (err)
                        return err;
index 99ab428bcfa089822e74b433aee7b1bf4076e34d..1cd6994fc446008b74dc9b77863edf0f24e14c33 100644 (file)
@@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        int first_block;
        struct super_block *sb;
        struct buffer_head *bhs;
-       struct buffer_head **bh;
+       struct buffer_head **bh = NULL;
        struct inode *inode;
        char *data;
        char *bitmap;
@@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size)
        return 0;
 }
 
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+int ext4_mb_init(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        unsigned i, j;
@@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb)
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
 
+       if (sbi->s_proc)
+               remove_proc_entry("mb_groups", sbi->s_proc);
+
        if (sbi->s_group_info) {
                for (i = 0; i < ngroups; i++) {
                        grinfo = ext4_get_group_info(sb, i);
@@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb)
        }
 
        free_percpu(sbi->s_locality_groups);
-       if (sbi->s_proc)
-               remove_proc_entry("mb_groups", sbi->s_proc);
 
        return 0;
 }
@@ -2797,7 +2798,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        }
        len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
        ext4_free_group_clusters_set(sb, gdp, len);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+       ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
 
        ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
        percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
@@ -3071,13 +3074,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
 {
        struct ext4_prealloc_space *pa = ac->ac_pa;
-       int len;
-
-       if (pa && pa->pa_type == MB_INODE_PA) {
-               len = ac->ac_b_ex.fe_len;
-               pa->pa_free += len;
-       }
 
+       if (pa && pa->pa_type == MB_INODE_PA)
+               pa->pa_free += ac->ac_b_ex.fe_len;      /* add the extent length back to the preallocation's free count */
 }
 
 /*
@@ -4636,6 +4635,7 @@ do_more:
                 */
                new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
                if (!new_entry) {
+                       ext4_mb_unload_buddy(&e4b);
                        err = -ENOMEM;
                        goto error_return;
                }
@@ -4659,7 +4659,9 @@ do_more:
 
        ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
        ext4_free_group_clusters_set(sb, gdp, ret);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+       ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
 
@@ -4803,7 +4805,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
        mb_free_blocks(NULL, &e4b, bit, count);
        blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
        ext4_free_group_clusters_set(sb, desc, blk_free_count);
-       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+       ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, desc);
        ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeclusters_counter,
                           EXT4_B2C(sbi, blocks_freed));
index ed6548d89165e1d9c31118aca21d3e89a3772ab2..f99a1311e84765296b0a0a04534e0be0536915bc 100644 (file)
@@ -6,12 +6,45 @@
 
 #include "ext4.h"
 
+/* Checksumming functions */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{      /* Checksum of the MMP block up to mmp_checksum, returned little-endian. */
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int offset = offsetof(struct mmp_struct, mmp_checksum); /* bytes preceding the checksum field */
+       __u32 csum;
+
+       csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+
+       return cpu_to_le32(csum);       /* same form as stored in mmp_checksum */
+}
+
+int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{      /* Returns nonzero when mmp_checksum matches or metadata_csum is off. */
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;       /* feature off: nothing to verify */
+
+       return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{      /* Store a freshly computed checksum in mmp->mmp_checksum. */
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;         /* feature off: leave the field untouched */
+
+       mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
 /*
  * Write the MMP block using WRITE_SYNC to try to get the block on-disk
  * faster.
  */
-static int write_mmp_block(struct buffer_head *bh)
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 {
+       struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+       ext4_mmp_csum_set(sb, mmp);
        mark_buffer_dirty(bh);
        lock_buffer(bh);
        bh->b_end_io = end_buffer_write_sync;
@@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
        }
 
        mmp = (struct mmp_struct *)((*bh)->b_data);
-       if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+       if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+           !ext4_mmp_csum_verify(sb, mmp))
                return -EINVAL;
 
        return 0;
@@ -120,7 +154,7 @@ static int kmmpd(void *data)
                mmp->mmp_time = cpu_to_le64(get_seconds());
                last_update_time = jiffies;
 
-               retval = write_mmp_block(bh);
+               retval = write_mmp_block(sb, bh);
                /*
                 * Don't spew too many error messages. Print one every
                 * (s_mmp_update_interval * 60) seconds.
@@ -200,7 +234,7 @@ static int kmmpd(void *data)
        mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
        mmp->mmp_time = cpu_to_le64(get_seconds());
 
-       retval = write_mmp_block(bh);
+       retval = write_mmp_block(sb, bh);
 
 failed:
        kfree(data);
@@ -299,7 +333,7 @@ skip:
        seq = mmp_new_seq();
        mmp->mmp_seq = cpu_to_le32(seq);
 
-       retval = write_mmp_block(bh);
+       retval = write_mmp_block(sb, bh);
        if (retval)
                goto failed;
 
index e2a3f4b0ff78d6f81fbf2228f12f201e6ab1a024..5845cd97bf8b094b0fc01082279e8d65ee73f241 100644 (file)
@@ -145,6 +145,14 @@ struct dx_map_entry
        u16 size;
 };
 
+/*
+ * Checksum tail that goes at the end of each htree block.
+ */
+struct dx_tail {
+       u32 dt_reserved;        /* NOTE(review): no user of this field is visible here */
+       __le32 dt_checksum;     /* crc32c(uuid+inum+dirblock) */
+};
+
 static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
 static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
 static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -180,6 +188,230 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
 
+/*
+ * checksumming functions
+ *
+ * EXT4_DIRENT_TAIL(block, blocksize) yields a pointer to the last
+ * sizeof(struct ext4_dir_entry_tail) bytes of the `blocksize'-byte
+ * buffer that starts at `block'.
+ */
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+       ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+                                       ((blocksize) - \
+                                        sizeof(struct ext4_dir_entry_tail))))
+
+/*
+ * Fill in an empty checksum tail record: the whole record is zeroed, its
+ * record length is set to the size of the tail itself, and the file-type
+ * slot receives the EXT4_FT_DIR_CSUM marker.
+ */
+static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+                                  unsigned int blocksize)
+{
+       const unsigned int tail_len = sizeof(*t);
+
+       memset(t, 0, tail_len);
+       t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+       t->det_rec_len = ext4_rec_len_to_disk(tail_len, blocksize);
+}
+
+/*
+ * Find the checksum "dirent" at the tail of a directory leaf block.
+ *
+ * The tail position is computed relative to `de', which the callers in
+ * this file pass as the start of the block buffer.  When PARANOID is
+ * defined the entries are walked by rec_len and must land exactly on the
+ * tail slot; otherwise the slot address is taken straight from the block
+ * size.  Returns NULL if the slot does not carry the tail signature.
+ */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+                                                  struct ext4_dir_entry *de)
+{
+       struct ext4_dir_entry_tail *t;
+#ifdef PARANOID
+       struct ext4_dir_entry *cur = de;
+       struct ext4_dir_entry *end;
+
+       end = (struct ext4_dir_entry *)EXT4_DIRENT_TAIL(de,
+                                       EXT4_BLOCK_SIZE(inode->i_sb));
+       while (cur < end && cur->rec_len)
+               cur = (struct ext4_dir_entry *)((void *)cur +
+                                               le16_to_cpu(cur->rec_len));
+
+       /* The walk has to stop exactly on the tail slot. */
+       if (cur != end)
+               return NULL;
+
+       t = (struct ext4_dir_entry_tail *)cur;
+#else
+       t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#endif
+
+       if (!t->det_reserved_zero1 && !t->det_reserved_zero2 &&
+           le16_to_cpu(t->det_rec_len) == sizeof(struct ext4_dir_entry_tail) &&
+           t->det_reserved_ft == EXT4_FT_DIR_CSUM)
+               return t;
+
+       return NULL;
+}
+
+/*
+ * Checksum the first `size' bytes of a directory leaf block, seeded with
+ * the inode's i_csum_seed, and return the result in little-endian form.
+ */
+static __le32 ext4_dirent_csum(struct inode *inode,
+                              struct ext4_dir_entry *dirent, int size)
+{
+       return cpu_to_le32(ext4_chksum(EXT4_SB(inode->i_sb),
+                                      EXT4_I(inode)->i_csum_seed,
+                                      (__u8 *)dirent, size));
+}
+
+/*
+ * Verify the checksum stored in the tail of a directory leaf block.
+ *
+ * Returns 1 when the block is acceptable (metadata_csum is not enabled, or
+ * the stored checksum matches the computed one) and 0 when the tail is
+ * missing or the checksum differs.
+ */
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+       struct ext4_dir_entry_tail *tail;
+       __le32 expected;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       tail = get_dirent_tail(inode, dirent);
+       if (!tail) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+                                "leaf for checksum.  Please run e2fsck -D.");
+               return 0;
+       }
+
+       /* Everything in front of the tail record is covered. */
+       expected = ext4_dirent_csum(inode, dirent,
+                                   (void *)tail - (void *)dirent);
+       return tail->det_checksum == expected;
+}
+
+/*
+ * Recompute the checksum of a directory leaf block and store it in the
+ * block's tail record.  Does nothing when metadata_csum is not enabled;
+ * logs an inode error and leaves the block alone when no tail is found.
+ */
+static void ext4_dirent_csum_set(struct inode *inode,
+                                struct ext4_dir_entry *dirent)
+{
+       struct ext4_dir_entry_tail *tail;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       tail = get_dirent_tail(inode, dirent);
+       if (tail) {
+               /* Everything in front of the tail record is covered. */
+               tail->det_checksum = ext4_dirent_csum(inode, dirent,
+                                       (void *)tail - (void *)dirent);
+               return;
+       }
+
+       EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+                        "leaf for checksum.  Please run e2fsck -D.");
+}
+
+/*
+ * Refresh the leaf block's tail checksum, then pass the buffer to
+ * ext4_handle_dirty_metadata() and return its result.
+ */
+static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
+                                               struct inode *inode,
+                                               struct buffer_head *bh)
+{
+       struct ext4_dir_entry *first = (struct ext4_dir_entry *)bh->b_data;
+
+       ext4_dirent_csum_set(inode, first);
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+/*
+ * Locate the dx_countlimit header inside an htree block.
+ *
+ * Two layouts are recognised from the first dirent's rec_len: an entry
+ * spanning the whole block (header at byte 8), or a 12-byte entry followed
+ * by one covering the rest of the block and a well-formed dx_root_info
+ * (header at byte 32).  Anything else yields NULL.  On success the byte
+ * offset is also stored through `offset' when that pointer is non-NULL.
+ */
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+                                              struct ext4_dir_entry *dirent,
+                                              int *offset)
+{
+       unsigned int first_len = le16_to_cpu(dirent->rec_len);
+       int count_offset;
+
+       if (first_len == EXT4_BLOCK_SIZE(inode->i_sb)) {
+               count_offset = 8;
+       } else {
+               struct ext4_dir_entry *second;
+               struct dx_root_info *info;
+
+               if (first_len != 12)
+                       return NULL;
+
+               second = (struct ext4_dir_entry *)((void *)dirent + 12);
+               if (le16_to_cpu(second->rec_len) !=
+                   EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+                       return NULL;
+
+               info = (struct dx_root_info *)((void *)second + 12);
+               if (info->reserved_zero ||
+                   info->info_length != sizeof(struct dx_root_info))
+                       return NULL;
+
+               count_offset = 32;
+       }
+
+       if (offset)
+               *offset = count_offset;
+       return (struct dx_countlimit *)((void *)dirent + count_offset);
+}
+
+/*
+ * Compute the checksum of an htree index block.
+ *
+ * The checksum is seeded with the inode's i_csum_seed and covers the first
+ * `count_offset' bytes of the block, the `count' dx_entry records after
+ * them, and finally the dx_tail with its dt_checksum field taken as zero.
+ * The result is returned in little-endian form.
+ *
+ * The tail is checksummed from a private copy.  Zeroing t->dt_checksum in
+ * the buffer itself, even briefly, would expose a bogus checksum to anyone
+ * reading or writing back the same buffer, and this function is also used
+ * on the read-only verification path.
+ */
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+                          int count_offset, int count, struct dx_tail *t)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct dx_tail tail = *t;
+       __u32 csum;
+       int size;
+
+       tail.dt_checksum = 0;
+
+       size = count_offset + (count * sizeof(struct dx_entry));
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+       csum = ext4_chksum(sbi, csum, (__u8 *)&tail, sizeof(tail));
+
+       return cpu_to_le32(csum);
+}
+
+/*
+ * Verify the dx_tail checksum of an htree index block.
+ *
+ * Returns 1 when the block is acceptable (metadata_csum is not enabled, or
+ * the stored checksum matches) and 0 otherwise.  A block whose count/limit
+ * header cannot be found, that has no room for a dx_tail, or whose entry
+ * count exceeds its limit is reported as a failure: such a block cannot
+ * have been checked, so it must not be treated as verified.
+ */
+static int ext4_dx_csum_verify(struct inode *inode,
+                              struct ext4_dir_entry *dirent)
+{
+       struct dx_countlimit *c;
+       struct dx_tail *t;
+       int count_offset, limit, count;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       c = get_dx_countlimit(inode, dirent, &count_offset);
+       if (!c) {
+               EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+               return 0;
+       }
+       limit = le16_to_cpu(c->limit);
+       count = le16_to_cpu(c->count);
+       if (count_offset + (limit * sizeof(struct dx_entry)) >
+           EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+                                "tree checksum found.  Run e2fsck -D.");
+               return 0;
+       }
+
+       /*
+        * count comes straight from disk; more entries than slots would
+        * make the checksum run past the end of the block.
+        */
+       if (count > limit)
+               return 0;
+
+       t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+
+       if (t->dt_checksum != ext4_dx_csum(inode, dirent, count_offset,
+                                           count, t))
+               return 0;
+       return 1;
+}
+
+/*
+ * Recompute the checksum of an htree index block and store it in the
+ * dx_tail that follows the block's `limit' entry slots.  Does nothing when
+ * metadata_csum is not enabled; logs an inode error and leaves the block
+ * alone when the header cannot be found or there is no room for the tail.
+ */
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+       struct dx_countlimit *cl;
+       struct dx_tail *tail;
+       int count_offset, limit, count;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       cl = get_dx_countlimit(inode, dirent, &count_offset);
+       if (!cl) {
+               EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+               return;
+       }
+
+       count = le16_to_cpu(cl->count);
+       limit = le16_to_cpu(cl->limit);
+       if (count_offset + (limit * sizeof(struct dx_entry)) <=
+           EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+               tail = (struct dx_tail *)&((struct dx_entry *)cl)[limit];
+               tail->dt_checksum = ext4_dx_csum(inode, dirent, count_offset,
+                                                count, tail);
+               return;
+       }
+
+       EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+                        "tree checksum.  Run e2fsck -D.");
+}
+
+/*
+ * Refresh the index block's dx_tail checksum, then pass the buffer to
+ * ext4_handle_dirty_metadata() and return its result.
+ */
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+                                           struct inode *inode,
+                                           struct buffer_head *bh)
+{
+       struct ext4_dir_entry *first = (struct ext4_dir_entry *)bh->b_data;
+
+       ext4_dx_csum_set(inode, first);
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
 /*
  * p is at least 6 bytes before the end of page
  */
@@ -239,12 +471,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
                EXT4_DIR_REC_LEN(2) - infosize;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               entry_space -= sizeof(struct dx_tail);
        return entry_space / sizeof(struct dx_entry);
 }
 
 static inline unsigned dx_node_limit(struct inode *dir)
 {
        unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               entry_space -= sizeof(struct dx_tail);
        return entry_space / sizeof(struct dx_entry);
 }
 
@@ -390,6 +630,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                goto fail;
        }
 
+       if (!buffer_verified(bh) &&
+           !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
+               ext4_warning(dir->i_sb, "Root failed checksum");
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+       }
+       set_buffer_verified(bh);
+
        entries = (struct dx_entry *) (((char *)&root->info) +
                                       root->info.info_length);
 
@@ -450,6 +699,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
                        goto fail2;
                at = entries = ((struct dx_node *) bh->b_data)->entries;
+
+               if (!buffer_verified(bh) &&
+                   !ext4_dx_csum_verify(dir,
+                                        (struct ext4_dir_entry *)bh->b_data)) {
+                       ext4_warning(dir->i_sb, "Node failed checksum");
+                       brelse(bh);
+                       *err = ERR_BAD_DX_DIR;
+                       goto fail;
+               }
+               set_buffer_verified(bh);
+
                if (dx_get_limit(entries) != dx_node_limit (dir)) {
                        ext4_warning(dir->i_sb,
                                     "dx entry: limit != node limit");
@@ -549,6 +809,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
                                      0, &err)))
                        return err; /* Failure */
+
+               if (!buffer_verified(bh) &&
+                   !ext4_dx_csum_verify(dir,
+                                        (struct ext4_dir_entry *)bh->b_data)) {
+                       ext4_warning(dir->i_sb, "Node failed checksum");
+                       return -EIO;
+               }
+               set_buffer_verified(bh);
+
                p++;
                brelse(p->bh);
                p->bh = bh;
@@ -577,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
        if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
                return err;
 
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+               return -EIO;
+       set_buffer_verified(bh);
+
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
@@ -936,6 +1210,15 @@ restart:
                        brelse(bh);
                        goto next;
                }
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_INODE(dir, "checksumming directory "
+                                        "block %lu", (unsigned long)block);
+                       brelse(bh);
+                       goto next;
+               }
+               set_buffer_verified(bh);
                i = search_dirblock(bh, dir, d_name,
                            block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
                if (i == 1) {
@@ -987,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                        goto errout;
 
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_INODE(dir, "checksumming directory "
+                                        "block %lu", (unsigned long)block);
+                       brelse(bh);
+                       *err = -EIO;
+                       goto errout;
+               }
+               set_buffer_verified(bh);
                retval = search_dirblock(bh, dir, d_name,
                                         block << EXT4_BLOCK_SIZE_BITS(sb),
                                         res_dir);
@@ -1037,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                        EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
                        return ERR_PTR(-EIO);
                }
+               if (unlikely(ino == dir->i_ino)) {
+                       EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
+                                        dentry->d_name.len,
+                                        dentry->d_name.name);
+                       return ERR_PTR(-EIO);
+               }
                inode = ext4_iget(dir->i_sb, ino);
                if (inode == ERR_PTR(-ESTALE)) {
                        EXT4_ERROR_INODE(dir,
@@ -1156,8 +1455,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        char *data1 = (*bh)->b_data, *data2;
        unsigned split, move, size;
        struct ext4_dir_entry_2 *de = NULL, *de2;
+       struct ext4_dir_entry_tail *t;
+       int     csum_size = 0;
        int     err = 0, i;
 
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
        bh2 = ext4_append (handle, dir, &newblock, &err);
        if (!(bh2)) {
                brelse(*bh);
@@ -1204,10 +1509,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
        /* Fancy dance to stay within two buffers */
        de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
        de = dx_pack_dirents(data1, blocksize);
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+       de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+                                          (char *) de,
                                           blocksize);
-       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+       de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+                                           (char *) de2,
                                            blocksize);
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(data2, blocksize);
+               initialize_dirent_tail(t, blocksize);
+
+               t = EXT4_DIRENT_TAIL(data1, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
 
@@ -1218,10 +1533,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                de = de2;
        }
        dx_insert_block(frame, hash2 + continued, newblock);
-       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+       err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
        if (err)
                goto journal_error;
-       err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+       err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
        if (err)
                goto journal_error;
        brelse(bh2);
@@ -1258,11 +1573,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
+       int             csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
 
        reclen = EXT4_DIR_REC_LEN(namelen);
        if (!de) {
                de = (struct ext4_dir_entry_2 *)bh->b_data;
-               top = bh->b_data + blocksize - reclen;
+               top = bh->b_data + (blocksize - csum_size) - reclen;
                while ((char *) de <= top) {
                        if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                return -EIO;
@@ -1295,11 +1615,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                de = de1;
        }
        de->file_type = EXT4_FT_UNKNOWN;
-       if (inode) {
-               de->inode = cpu_to_le32(inode->i_ino);
-               ext4_set_de_type(dir->i_sb, de, inode->i_mode);
-       } else
-               de->inode = 0;
+       de->inode = cpu_to_le32(inode->i_ino);
+       ext4_set_de_type(dir->i_sb, de, inode->i_mode);
        de->name_len = namelen;
        memcpy(de->name, name, namelen);
        /*
@@ -1318,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        dir->i_version++;
        ext4_mark_inode_dirty(handle, dir);
        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, dir, bh);
+       err = ext4_handle_dirty_dirent_node(handle, dir, bh);
        if (err)
                ext4_std_error(dir->i_sb, err);
        return 0;
@@ -1339,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        struct dx_frame frames[2], *frame;
        struct dx_entry *entries;
        struct ext4_dir_entry_2 *de, *de2;
+       struct ext4_dir_entry_tail *t;
        char            *data1, *top;
        unsigned        len;
        int             retval;
@@ -1346,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        struct dx_hash_info hinfo;
        ext4_lblk_t  block;
        struct fake_dirent *fde;
+       int             csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
 
        blocksize =  dir->i_sb->s_blocksize;
        dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
@@ -1366,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
                brelse(bh);
                return -EIO;
        }
-       len = ((char *) root) + blocksize - (char *) de;
+       len = ((char *) root) + (blocksize - csum_size) - (char *) de;
 
        /* Allocate new block for the 0th block's dirents */
        bh2 = ext4_append(handle, dir, &block, &retval);
@@ -1382,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        top = data1 + len;
        while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
                de = de2;
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+       de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+                                          (char *) de,
                                           blocksize);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(data1, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
@@ -1408,8 +1738,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        frame->bh = bh;
        bh = bh2;
 
-       ext4_handle_dirty_metadata(handle, dir, frame->bh);
-       ext4_handle_dirty_metadata(handle, dir, bh);
+       ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+       ext4_handle_dirty_dirent_node(handle, dir, bh);
 
        de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
        if (!de) {
@@ -1445,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
        struct inode *dir = dentry->d_parent->d_inode;
        struct buffer_head *bh;
        struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
        struct super_block *sb;
        int     retval;
        int     dx_fallback=0;
        unsigned blocksize;
        ext4_lblk_t block, blocks;
+       int     csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
 
        sb = dir->i_sb;
        blocksize = sb->s_blocksize;
@@ -1468,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                bh = ext4_bread(handle, dir, block, 0, &retval);
                if(!bh)
                        return retval;
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data))
+                       return -EIO;
+               set_buffer_verified(bh);
                retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
                if (retval != -ENOSPC) {
                        brelse(bh);
@@ -1484,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-       de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+       de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
        brelse(bh);
        if (retval == 0)
@@ -1516,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
        if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
                goto cleanup;
 
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+               goto journal_error;
+       set_buffer_verified(bh);
+
        BUFFER_TRACE(bh, "get_write_access");
        err = ext4_journal_get_write_access(handle, bh);
        if (err)
@@ -1583,7 +1935,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        dxtrace(dx_show_index("node", frames[1].entries));
                        dxtrace(dx_show_index("node",
                               ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                        if (err)
                                goto journal_error;
                        brelse (bh2);
@@ -1609,7 +1961,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        if (err)
                                goto journal_error;
                }
-               err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
                if (err) {
                        ext4_std_error(inode->i_sb, err);
                        goto cleanup;
@@ -1641,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle,
 {
        struct ext4_dir_entry_2 *de, *pde;
        unsigned int blocksize = dir->i_sb->s_blocksize;
+       int csum_size = 0;
        int i, err;
 
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
        i = 0;
        pde = NULL;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
-       while (i < bh->b_size) {
+       while (i < bh->b_size - csum_size) {
                if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                        return -EIO;
                if (de == de_del)  {
@@ -1667,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle,
                                de->inode = 0;
                        dir->i_version++;
                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                       err = ext4_handle_dirty_metadata(handle, dir, bh);
+                       err = ext4_handle_dirty_dirent_node(handle, dir, bh);
                        if (unlikely(err)) {
                                ext4_std_error(dir->i_sb, err);
                                return err;
@@ -1809,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        struct inode *inode;
        struct buffer_head *dir_block = NULL;
        struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
        unsigned int blocksize = dir->i_sb->s_blocksize;
+       int csum_size = 0;
        int err, retries = 0;
 
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
        if (EXT4_DIR_LINK_MAX(dir))
                return -EMLINK;
 
@@ -1852,16 +2215,24 @@ retry:
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        de = ext4_next_entry(de, blocksize);
        de->inode = cpu_to_le32(dir->i_ino);
-       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+       de->rec_len = ext4_rec_len_to_disk(blocksize -
+                                          (csum_size + EXT4_DIR_REC_LEN(1)),
                                           blocksize);
        de->name_len = 2;
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
        set_nlink(inode, 2);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
        BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, inode, dir_block);
+       err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
        if (err)
                goto out_clear_inode;
+       set_buffer_verified(dir_block);
        err = ext4_mark_inode_dirty(handle, inode);
        if (!err)
                err = ext4_add_entry(handle, dentry, inode);
@@ -1911,6 +2282,14 @@ static int empty_dir(struct inode *inode)
                                     inode->i_ino);
                return 1;
        }
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(inode,
+                       (struct ext4_dir_entry *)bh->b_data)) {
+               EXT4_ERROR_INODE(inode, "checksum error reading directory "
+                                "lblock 0");
+               return -EIO;
+       }
+       set_buffer_verified(bh);
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de1 = ext4_next_entry(de, sb->s_blocksize);
        if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -1942,6 +2321,14 @@ static int empty_dir(struct inode *inode)
                                offset += sb->s_blocksize;
                                continue;
                        }
+                       if (!buffer_verified(bh) &&
+                           !ext4_dirent_csum_verify(inode,
+                                       (struct ext4_dir_entry *)bh->b_data)) {
+                               EXT4_ERROR_INODE(inode, "checksum error "
+                                                "reading directory lblock 0");
+                               return -EIO;
+                       }
+                       set_buffer_verified(bh);
                        de = (struct ext4_dir_entry_2 *) bh->b_data;
                }
                if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
@@ -2010,7 +2397,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        /* Insert this inode at the head of the on-disk orphan list... */
        NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
        EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-       err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_super_now(handle, sb);
        rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
        if (!err)
                err = rc;
@@ -2083,7 +2470,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                if (err)
                        goto out_brelse;
                sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-               err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+               err = ext4_handle_dirty_super_now(handle, inode->i_sb);
        } else {
                struct ext4_iloc iloc2;
                struct inode *i_prev =
@@ -2442,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
                if (!dir_bh)
                        goto end_rename;
+               if (!buffer_verified(dir_bh) &&
+                   !ext4_dirent_csum_verify(old_inode,
+                               (struct ext4_dir_entry *)dir_bh->b_data))
+                       goto end_rename;
+               set_buffer_verified(dir_bh);
                if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
                                old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
                        goto end_rename;
@@ -2472,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                                        ext4_current_time(new_dir);
                ext4_mark_inode_dirty(handle, new_dir);
                BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-               retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+               retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
                if (unlikely(retval)) {
                        ext4_std_error(new_dir->i_sb, retval);
                        goto end_rename;
@@ -2526,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
                                                cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-               retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
+               retval = ext4_handle_dirty_dirent_node(handle, old_inode,
+                                                      dir_bh);
                if (retval) {
                        ext4_std_error(old_dir->i_sb, retval);
                        goto end_rename;
index 59fa0be272516adf6cbbc94384106690bf710c65..7ea6cbb44121952bf0d4f81f914950ab284dba6b 100644 (file)
@@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
        if (flex_gd == NULL)
                goto out3;
 
+       if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+               goto out2;
        flex_gd->count = flexbg_size;
 
        flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
@@ -796,7 +798,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
        ext4_kvfree(o_group_desc);
 
        le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-       err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_super_now(handle, sb);
        if (err)
                ext4_std_error(sb, err);
 
@@ -968,6 +970,8 @@ static void update_backups(struct super_block *sb,
                goto exit_err;
        }
 
+       ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+
        while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
                struct buffer_head *bh;
 
@@ -1067,6 +1071,54 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
        return err;
 }
 
+/*
+ * Return a referenced buffer_head for bitmap block @block, reading it from
+ * disk if it is not already up to date.  Returns NULL if the buffer cannot
+ * be obtained or the read fails.  The caller must brelse() the result.
+ */
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+       struct buffer_head *bh = sb_getblk(sb, block);
+       if (!bh)
+               return NULL;
+
+       if (bitmap_uptodate(bh))
+               return bh;
+
+       /*
+        * bh_submit_read() must be entered with the buffer locked and
+        * leaves it unlocked on every return path (success or error), so
+        * the buffer must not be unlocked a second time here.
+        */
+       lock_buffer(bh);
+       if (bh_submit_read(bh) < 0) {
+               brelse(bh);
+               return NULL;
+       }
+
+       return bh;
+}
+
+/*
+ * Record the inode-bitmap and block-bitmap checksums of group @group in its
+ * descriptor @gdp.  Returns 0 without doing anything when the filesystem
+ * lacks the metadata_csum feature, and -EIO when ext4_get_bitmap() cannot
+ * provide one of the bitmap buffers.
+ */
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+                                    ext4_group_t group,
+                                    struct ext4_group_desc *gdp,
+                                    struct ext4_new_group_data *group_data)
+{
+       struct buffer_head *inode_bh;
+       struct buffer_head *block_bh;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 0;
+
+       /* Inode bitmap: one bit per inode, hence the division by 8. */
+       inode_bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+       if (inode_bh == NULL)
+               return -EIO;
+       ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       brelse(inode_bh);
+
+       /* Block bitmap, handled the same way. */
+       block_bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+       if (block_bh == NULL)
+               return -EIO;
+       ext4_block_bitmap_csum_set(sb, group, gdp, block_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       brelse(block_bh);
+
+       return 0;
+}
+
 /*
  * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
  */
@@ -1093,18 +1145,24 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
                 */
                gdb_bh = sbi->s_group_desc[gdb_num];
                /* Update group descriptor block for new group */
-               gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+               gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
                                                 gdb_off * EXT4_DESC_SIZE(sb));
 
                memset(gdp, 0, EXT4_DESC_SIZE(sb));
                ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
                ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+               err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+               if (err) {
+                       ext4_std_error(sb, err);
+                       break;
+               }
+
                ext4_inode_table_set(sb, gdp, group_data->inode_table);
                ext4_free_group_clusters_set(sb, gdp,
                                             EXT4_B2C(sbi, group_data->free_blocks_count));
                ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
                gdp->bg_flags = cpu_to_le16(*bg_flags);
-               gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+               ext4_group_desc_csum_set(sb, group, gdp);
 
                err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
                if (unlikely(err)) {
@@ -1343,17 +1401,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
                           (1 + ext4_bg_num_gdb(sb, group + i) +
                            le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
                group_data[i].free_blocks_count = blocks_per_group - overhead;
-               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                              EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+               if (ext4_has_group_desc_csum(sb))
                        flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
                                               EXT4_BG_INODE_UNINIT;
                else
                        flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
        }
 
-       if (last_group == n_group &&
-           EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                      EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+       if (last_group == n_group && ext4_has_group_desc_csum(sb))
                /* We need to initialize block bitmap of last group. */
                flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
 
index 35b5954489eeb88c6c5a29fd76fced5c3472e6f5..eb7aa3e4ef05caf136f24e0565a28e6d1e0a1539 100644 (file)
@@ -112,6 +112,48 @@ static struct file_system_type ext3_fs_type = {
 #define IS_EXT3_SB(sb) (0)
 #endif
 
+/*
+ * Returns non-zero when the superblock's checksum type is acceptable: always
+ * when metadata_csum is not enabled, otherwise only when s_checksum_type is
+ * EXT4_CRC32C_CHKSUM.
+ */
+static int ext4_verify_csum_type(struct super_block *sb,
+                                struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+
+       return 1;
+}
+
+/*
+ * Compute the superblock checksum in on-disk (little-endian) form.  The
+ * checksum covers the bytes of @es that precede the s_checksum field and is
+ * seeded with ~0.
+ */
+static __le32 ext4_superblock_csum(struct super_block *sb,
+                                  struct ext4_super_block *es)
+{
+       const int csum_len = offsetof(struct ext4_super_block, s_checksum);
+
+       return cpu_to_le32(ext4_chksum(EXT4_SB(sb), ~0, (char *)es,
+                                      csum_len));
+}
+
+/*
+ * Returns 1 if the stored superblock checksum matches the computed one, or
+ * if metadata_csum is not enabled (nothing to verify); 0 on mismatch.
+ */
+int ext4_superblock_csum_verify(struct super_block *sb,
+                               struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return es->s_checksum == ext4_superblock_csum(sb, es);
+
+       return 1;
+}
+
+/*
+ * Store a freshly computed checksum in @es->s_checksum.  Leaves @es
+ * untouched when metadata_csum is not enabled.
+ */
+void ext4_superblock_csum_set(struct super_block *sb,
+                             struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
 void *ext4_kvmalloc(size_t size, gfp_t flags)
 {
        void *ret;
@@ -497,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function,
        printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
               sb->s_id, function, line, current->comm, &vaf);
        va_end(args);
+       save_error_info(sb, function, line);
 
        ext4_handle_error(sb);
 }
@@ -905,6 +948,8 @@ static void ext4_put_super(struct super_block *sb)
        unlock_super(sb);
        kobject_put(&sbi->s_kobj);
        wait_for_completion(&sbi->s_kobj_unregister);
+       if (sbi->s_chksum_driver)
+               crypto_free_shash(sbi->s_chksum_driver);
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
 }
@@ -1922,43 +1967,69 @@ failed:
        return 0;
 }
 
-__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
-                           struct ext4_group_desc *gdp)
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+                                  struct ext4_group_desc *gdp)
 {
+       int offset;
        __u16 crc = 0;
+       __le32 le_group = cpu_to_le32(block_group);
 
-       if (sbi->s_es->s_feature_ro_compat &
-           cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               int offset = offsetof(struct ext4_group_desc, bg_checksum);
-               __le32 le_group = cpu_to_le32(block_group);
-
-               crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
-               crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
-               crc = crc16(crc, (__u8 *)gdp, offset);
-               offset += sizeof(gdp->bg_checksum); /* skip checksum */
-               /* for checksum of struct ext4_group_desc do the rest...*/
-               if ((sbi->s_es->s_feature_incompat &
-                    cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
-                   offset < le16_to_cpu(sbi->s_es->s_desc_size))
-                       crc = crc16(crc, (__u8 *)gdp + offset,
-                                   le16_to_cpu(sbi->s_es->s_desc_size) -
-                                       offset);
+       if ((sbi->s_es->s_feature_ro_compat &
+            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+               /* Use new metadata_csum algorithm */
+               __u16 old_csum;
+               __u32 csum32;
+
+               old_csum = gdp->bg_checksum;
+               gdp->bg_checksum = 0;
+               csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+                                    sizeof(le_group));
+               csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+                                    sbi->s_desc_size);
+               gdp->bg_checksum = old_csum;
+
+               crc = csum32 & 0xFFFF;
+               goto out;
        }
 
+       /* old crc16 code */
+       offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+       crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+       crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+       crc = crc16(crc, (__u8 *)gdp, offset);
+       offset += sizeof(gdp->bg_checksum); /* skip checksum */
+       /* for checksum of struct ext4_group_desc do the rest...*/
+       if ((sbi->s_es->s_feature_incompat &
+            cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+           offset < le16_to_cpu(sbi->s_es->s_desc_size))
+               crc = crc16(crc, (__u8 *)gdp + offset,
+                           le16_to_cpu(sbi->s_es->s_desc_size) -
+                               offset);
+
+out:
        return cpu_to_le16(crc);
 }
 
-int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
                                struct ext4_group_desc *gdp)
 {
-       if ((sbi->s_es->s_feature_ro_compat &
-            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
-           (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
+       if (ext4_has_group_desc_csum(sb) &&
+           (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+                                                     block_group, gdp)))
                return 0;
 
        return 1;
 }
 
+/*
+ * Recompute and store the checksum of group descriptor @gdp for group
+ * @block_group.  Does nothing when ext4_has_group_desc_csum() reports that
+ * group descriptors are not checksummed.
+ */
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+                             struct ext4_group_desc *gdp)
+{
+       if (ext4_has_group_desc_csum(sb))
+               gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb),
+                                                       block_group, gdp);
+}
+
 /* Called at mount-time, super-block is locked */
 static int ext4_check_descriptors(struct super_block *sb,
                                  ext4_group_t *first_not_zeroed)
@@ -2013,7 +2084,7 @@ static int ext4_check_descriptors(struct super_block *sb,
                        return 0;
                }
                ext4_lock_group(sb, i);
-               if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
+               if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                        ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                 "Checksum for group %u failed (%u!=%u)",
                                 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
@@ -2417,6 +2488,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
        return count;
 }
 
+/*
+ * Store handler for the "trigger_fs_error" attribute: raise an ext4_error()
+ * on the filesystem using the written text as the message.  Requires
+ * CAP_SYS_ADMIN (-EPERM otherwise).  Always consumes the whole write.
+ */
+static ssize_t trigger_test_error(struct ext4_attr *a,
+                                 struct ext4_sb_info *sbi,
+                                 const char *buf, size_t count)
+{
+       int msg_len;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Strip one trailing newline, as left behind by e.g. "echo". */
+       msg_len = count;
+       if (msg_len && buf[msg_len - 1] == '\n')
+               msg_len -= 1;
+
+       /* An empty message triggers nothing. */
+       if (msg_len)
+               ext4_error(sbi->s_sb, "%.*s", msg_len, buf);
+
+       return count;
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {                  \
        .attr = {.name = __stringify(_name), .mode = _mode },   \
@@ -2447,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
 EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
 EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
 EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
 
 static struct attribute *ext4_attrs[] = {
        ATTR_LIST(delayed_allocation_blocks),
@@ -2461,6 +2550,7 @@ static struct attribute *ext4_attrs[] = {
        ATTR_LIST(mb_stream_req),
        ATTR_LIST(mb_group_prealloc),
        ATTR_LIST(max_writeback_mb_bump),
+       ATTR_LIST(trigger_fs_error),
        NULL,
 };
 
@@ -2957,6 +3047,44 @@ static void ext4_destroy_lazyinit_thread(void)
        kthread_stop(ext4_lazyinit_task);
 }
 
+/*
+ * Bring the journal's checksum/async-commit feature flags in line with the
+ * mount options.  With metadata_csum the journal uses checksum v2 (an
+ * incompat flag); otherwise it uses checksum v1 (a compat flag).
+ *
+ * Returns the result of jbd2_journal_set_features() when features had to be
+ * set, and 1 when features were only cleared.
+ */
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       /* Default: journal checksum v1. */
+       int csum_compat = JBD2_FEATURE_COMPAT_CHECKSUM;
+       int csum_incompat = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               /* journal checksum v2 */
+               csum_compat = 0;
+               csum_incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;
+       }
+
+       /* Async commit is requested together with the checksum flags. */
+       if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+               return jbd2_journal_set_features(sbi->s_journal,
+                               csum_compat, 0,
+                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+                               csum_incompat);
+
+       /* Checksums without async commit: set one, clear the other. */
+       if (test_opt(sb, JOURNAL_CHECKSUM)) {
+               int set_ok = jbd2_journal_set_features(sbi->s_journal,
+                               csum_compat, 0,
+                               csum_incompat);
+
+               jbd2_journal_clear_features(sbi->s_journal, 0, 0,
+                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+               return set_ok;
+       }
+
+       /* Neither option given: drop every checksum-related flag. */
+       jbd2_journal_clear_features(sbi->s_journal,
+                       JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+                       JBD2_FEATURE_INCOMPAT_CSUM_V2);
+       return 1;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
        char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -2993,6 +3121,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto out_free_orig;
        }
        sb->s_fs_info = sbi;
+       sbi->s_sb = sb;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
        sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
@@ -3032,13 +3161,54 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         * Note: s_es must be initialized as soon as possible because
         *       some ext4 macro-instructions depend on its value
         */
-       es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+       es = (struct ext4_super_block *) (bh->b_data + offset);
        sbi->s_es = es;
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC)
                goto cantfind_ext4;
        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
 
+       /* Warn if metadata_csum and gdt_csum are both set. */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+           EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+               ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+                            "redundant flags; please run fsck.");
+
+       /* Check for a known checksum algorithm */
+       if (!ext4_verify_csum_type(sb, es)) {
+               ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+                        "unknown checksum algorithm.");
+               silent = 1;
+               goto cantfind_ext4;
+       }
+
+       /* Load the checksum driver */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+               if (IS_ERR(sbi->s_chksum_driver)) {
+                       ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+                       ret = PTR_ERR(sbi->s_chksum_driver);
+                       sbi->s_chksum_driver = NULL;
+                       goto failed_mount;
+               }
+       }
+
+       /* Check superblock checksum */
+       if (!ext4_superblock_csum_verify(sb, es)) {
+               ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+                        "invalid superblock checksum.  Run e2fsck?");
+               silent = 1;
+               goto cantfind_ext4;
+       }
+
+       /* Precompute checksum seed for all metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+                                              sizeof(es->s_uuid));
+
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
        set_opt(sb, INIT_INODE_TABLE);
@@ -3200,7 +3370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                               "Can't read superblock on 2nd try");
                        goto failed_mount;
                }
-               es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+               es = (struct ext4_super_block *)(bh->b_data + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
                        ext4_msg(sb, KERN_ERR,
@@ -3392,6 +3562,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                          GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
                ext4_msg(sb, KERN_ERR, "not enough memory");
+               ret = -ENOMEM;
                goto failed_mount;
        }
 
@@ -3449,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
        if (err) {
                ext4_msg(sb, KERN_ERR, "insufficient memory");
+               ret = err;
                goto failed_mount3;
        }
 
@@ -3506,26 +3678,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                goto no_journal;
        }
 
-       if (ext4_blocks_count(es) > 0xffffffffULL &&
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
                ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto failed_mount_wq;
        }
 
-       if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-               jbd2_journal_set_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-       } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
-               jbd2_journal_set_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
-               jbd2_journal_clear_features(sbi->s_journal, 0, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-       } else {
-               jbd2_journal_clear_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+       if (!set_journal_csum_feature_set(sb)) {
+               ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+                        "feature set");
+               goto failed_mount_wq;
        }
 
        /* We have now updated the journal if required, so we can
@@ -3606,7 +3769,8 @@ no_journal:
                goto failed_mount4;
        }
 
-       ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
+       if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+               sb->s_flags |= MS_RDONLY;
 
        /* determine the minimum size of new large inodes, if present */
        if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -3641,7 +3805,7 @@ no_journal:
        }
 
        ext4_ext_init(sb);
-       err = ext4_mb_init(sb, needs_recovery);
+       err = ext4_mb_init(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                         err);
@@ -3724,6 +3888,8 @@ failed_mount2:
                brelse(sbi->s_group_desc[i]);
        ext4_kvfree(sbi->s_group_desc);
 failed_mount:
+       if (sbi->s_chksum_driver)
+               crypto_free_shash(sbi->s_chksum_driver);
        if (sbi->s_proc) {
                remove_proc_entry("options", sbi->s_proc);
                remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -3847,7 +4013,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
                goto out_bdev;
        }
 
-       es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+       es = (struct ext4_super_block *) (bh->b_data + offset);
        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -4039,6 +4205,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
                                &EXT4_SB(sb)->s_freeinodes_counter));
        sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
+       ext4_superblock_csum_set(sb, es);
        mark_buffer_dirty(sbh);
        if (sync) {
                error = sync_dirty_buffer(sbh);
@@ -4333,7 +4500,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);
 
-                               if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+                               if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                        ext4_msg(sb, KERN_ERR,
               "ext4_remount: Checksum for group %u failed (%u!=%u)",
                g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
index e88748e55c0f246e90ca21c2094303719f83df07..e56c9ed7d6e30d523b7f8e4b638f9190427cf50d 100644 (file)
@@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
        NULL
 };
 
+/*
+ * Compute the checksum of the xattr block headed by @hdr, in on-disk
+ * (little-endian) form.
+ *
+ * The checksum is seeded with the inode's checksum seed when the block has
+ * exactly one reference, and with the filesystem seed folded with the block
+ * number otherwise.  h_checksum is treated as zero while the block is being
+ * summed and is restored before returning.
+ */
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+                                   sector_t block_nr,
+                                   struct ext4_xattr_header *hdr)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       /*
+        * Always fold a 64-bit little-endian block number into the
+        * checksum: sector_t is not guaranteed to be 64 bits wide, and the
+        * on-disk result must not depend on how the kernel was built.
+        */
+       __le64 dsk_block_nr = cpu_to_le64(block_nr);
+       __le32 save_csum;
+       __u32 csum;
+
+       save_csum = hdr->h_checksum;
+       hdr->h_checksum = 0;
+       if (le32_to_cpu(hdr->h_refcount) != 1)
+               csum = ext4_chksum(sbi, sbi->s_csum_seed,
+                                  (__u8 *)&dsk_block_nr,
+                                  sizeof(dsk_block_nr));
+       else
+               csum = ei->i_csum_seed;
+       csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+                          EXT4_BLOCK_SIZE(inode->i_sb));
+       hdr->h_checksum = save_csum;
+       return cpu_to_le32(csum);
+}
+
+/*
+ * Returns 1 if the xattr block's stored checksum matches the computed one,
+ * or if metadata_csum is not enabled; 0 on mismatch.
+ */
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+                                       sector_t block_nr,
+                                       struct ext4_xattr_header *hdr)
+{
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       return hdr->h_checksum == ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+/*
+ * Store a freshly computed checksum in @hdr->h_checksum.  Leaves the header
+ * untouched when metadata_csum is not enabled.
+ */
+static void ext4_xattr_block_csum_set(struct inode *inode,
+                                     sector_t block_nr,
+                                     struct ext4_xattr_header *hdr)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+/*
+ * Refresh the checksum of the xattr block held in @bh, then mark the buffer
+ * dirty through ext4_handle_dirty_metadata() and return its result.
+ */
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+                                               struct inode *inode,
+                                               struct buffer_head *bh)
+{
+       struct ext4_xattr_header *hdr = BHDR(bh);
+
+       ext4_xattr_block_csum_set(inode, bh->b_blocknr, hdr);
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
 static inline const struct xattr_handler *
 ext4_xattr_handler(int name_index)
 {
@@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
 }
 
 static inline int
-ext4_xattr_check_block(struct buffer_head *bh)
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
 {
+       int error;
+
+       if (buffer_verified(bh))
+               return 0;
+
        if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
            BHDR(bh)->h_blocks != cpu_to_le32(1))
                return -EIO;
-       return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+       if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+               return -EIO;
+       error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+       if (!error)
+               set_buffer_verified(bh);
+       return error;
 }
 
 static inline int
@@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
                goto cleanup;
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-       if (ext4_xattr_check_block(bh)) {
+       if (ext4_xattr_check_block(inode, bh)) {
 bad_block:
                EXT4_ERROR_INODE(inode, "bad block %llu",
                                 EXT4_I(inode)->i_file_acl);
@@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                goto cleanup;
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-       if (ext4_xattr_check_block(bh)) {
+       if (ext4_xattr_check_block(inode, bh)) {
                EXT4_ERROR_INODE(inode, "bad block %llu",
                                 EXT4_I(inode)->i_file_acl);
                error = -EIO;
@@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                if (ce)
                        mb_cache_entry_release(ce);
                unlock_buffer(bh);
-               error = ext4_handle_dirty_metadata(handle, inode, bh);
+               error = ext4_handle_dirty_xattr_block(handle, inode, bh);
                if (IS_SYNC(inode))
                        ext4_handle_sync(handle);
                dquot_free_block(inode, 1);
@@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
                        atomic_read(&(bs->bh->b_count)),
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
-               if (ext4_xattr_check_block(bs->bh)) {
+               if (ext4_xattr_check_block(inode, bs->bh)) {
                        EXT4_ERROR_INODE(inode, "bad block %llu",
                                         EXT4_I(inode)->i_file_acl);
                        error = -EIO;
@@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                        if (error == -EIO)
                                goto bad_block;
                        if (!error)
-                               error = ext4_handle_dirty_metadata(handle,
-                                                                  inode,
-                                                                  bs->bh);
+                               error = ext4_handle_dirty_xattr_block(handle,
+                                                                     inode,
+                                                                     bs->bh);
                        if (error)
                                goto cleanup;
                        goto inserted;
@@ -796,9 +858,9 @@ inserted:
                                ea_bdebug(new_bh, "reusing; refcount now=%d",
                                        le32_to_cpu(BHDR(new_bh)->h_refcount));
                                unlock_buffer(new_bh);
-                               error = ext4_handle_dirty_metadata(handle,
-                                                                  inode,
-                                                                  new_bh);
+                               error = ext4_handle_dirty_xattr_block(handle,
+                                                                     inode,
+                                                                     new_bh);
                                if (error)
                                        goto cleanup_dquot;
                        }
@@ -855,8 +917,8 @@ getblk_failed:
                        set_buffer_uptodate(new_bh);
                        unlock_buffer(new_bh);
                        ext4_xattr_cache_insert(new_bh);
-                       error = ext4_handle_dirty_metadata(handle,
-                                                          inode, new_bh);
+                       error = ext4_handle_dirty_xattr_block(handle,
+                                                             inode, new_bh);
                        if (error)
                                goto cleanup;
                }
@@ -1193,7 +1255,7 @@ retry:
                error = -EIO;
                if (!bh)
                        goto cleanup;
-               if (ext4_xattr_check_block(bh)) {
+               if (ext4_xattr_check_block(inode, bh)) {
                        EXT4_ERROR_INODE(inode, "bad block %llu",
                                         EXT4_I(inode)->i_file_acl);
                        error = -EIO;
index 25b7387ff183f880cdb9ccaf2529ca8c0f218a7b..91f31ca7d9af9df24a965c64bb0271c43a4d4b09 100644 (file)
@@ -27,7 +27,9 @@ struct ext4_xattr_header {
        __le32  h_refcount;     /* reference count */
        __le32  h_blocks;       /* number of disk blocks used */
        __le32  h_hash;         /* hash value of all attributes */
-       __u32   h_reserved[4];  /* zero right now */
+       __le32  h_checksum;     /* crc32c(uuid+id+xattrblock) */
+                               /* id = inum if refcount=1, blknum otherwise */
+       __u32   h_reserved[3];  /* zero right now */
 };
 
 struct ext4_xattr_ibody_header {
index c2973ea5df9ab57ccd6ee6a44dc2c13ebb22ab35..a3d81ebf6d864a8c2189147e5771c435473b2e42 100644 (file)
@@ -735,10 +735,9 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
 }
 
 static int
-fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
+fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
 {
        int len = *lenp;
-       struct inode *inode =  de->d_inode;
        u32 ipos_h, ipos_m, ipos_l;
 
        if (len < 5) {
@@ -754,9 +753,9 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
        fh[1] = inode->i_generation;
        fh[2] = ipos_h;
        fh[3] = ipos_m | MSDOS_I(inode)->i_logstart;
-       spin_lock(&de->d_lock);
-       fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart;
-       spin_unlock(&de->d_lock);
+       fh[4] = ipos_l;
+       if (parent)
+               fh[4] |= MSDOS_I(parent)->i_logstart;
        return 3;
 }
 
index d078b75572a75eb9117092ee5bb752c84e1b38b8..81b70e665bf000412f73aa300890a53823db36f0 100644 (file)
@@ -442,28 +442,24 @@ static int check_fcntl_cmd(unsigned cmd)
 SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
 {      
        struct file *filp;
+       int fput_needed;
        long err = -EBADF;
 
-       filp = fget_raw(fd);
+       filp = fget_raw_light(fd, &fput_needed);
        if (!filp)
                goto out;
 
        if (unlikely(filp->f_mode & FMODE_PATH)) {
-               if (!check_fcntl_cmd(cmd)) {
-                       fput(filp);
-                       goto out;
-               }
+               if (!check_fcntl_cmd(cmd))
+                       goto out1;
        }
 
        err = security_file_fcntl(filp, cmd, arg);
-       if (err) {
-               fput(filp);
-               return err;
-       }
+       if (!err)
+               err = do_fcntl(fd, cmd, arg, filp);
 
-       err = do_fcntl(fd, cmd, arg, filp);
-
-       fput(filp);
+out1:
+       fput_light(filp, fput_needed);
 out:
        return err;
 }
@@ -473,26 +469,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                unsigned long, arg)
 {      
        struct file * filp;
-       long err;
+       long err = -EBADF;
+       int fput_needed;
 
-       err = -EBADF;
-       filp = fget_raw(fd);
+       filp = fget_raw_light(fd, &fput_needed);
        if (!filp)
                goto out;
 
        if (unlikely(filp->f_mode & FMODE_PATH)) {
-               if (!check_fcntl_cmd(cmd)) {
-                       fput(filp);
-                       goto out;
-               }
+               if (!check_fcntl_cmd(cmd))
+                       goto out1;
        }
 
        err = security_file_fcntl(filp, cmd, arg);
-       if (err) {
-               fput(filp);
-               return err;
-       }
-       err = -EBADF;
+       if (err)
+               goto out1;
        
        switch (cmd) {
                case F_GETLK64:
@@ -507,7 +498,8 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                        err = do_fcntl(fd, cmd, arg, filp);
                        break;
        }
-       fput(filp);
+out1:
+       fput_light(filp, fput_needed);
 out:
        return err;
 }
index 70f2a0fd6aec62b28724d46e356dc0ff871f88b8..a305d9e2d1b2aac05dcd456bdd23885652272439 100644 (file)
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
        .max_files = NR_FILE
 };
 
-DECLARE_LGLOCK(files_lglock);
 DEFINE_LGLOCK(files_lglock);
 
 /* SLAB cache for file structures */
@@ -421,9 +420,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
  */
 void file_sb_list_add(struct file *file, struct super_block *sb)
 {
-       lg_local_lock(files_lglock);
+       lg_local_lock(&files_lglock);
        __file_sb_list_add(file, sb);
-       lg_local_unlock(files_lglock);
+       lg_local_unlock(&files_lglock);
 }
 
 /**
@@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb)
 void file_sb_list_del(struct file *file)
 {
        if (!list_empty(&file->f_u.fu_list)) {
-               lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
                list_del_init(&file->f_u.fu_list);
-               lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
        }
 }
 
@@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *sb)
        struct file *f;
 
 retry:
-       lg_global_lock(files_lglock);
+       lg_global_lock(&files_lglock);
        do_file_list_for_each_entry(sb, f) {
                struct vfsmount *mnt;
                if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -502,12 +501,12 @@ retry:
                file_release_write(f);
                mnt = mntget(f->f_path.mnt);
                /* This can sleep, so we can't hold the spinlock. */
-               lg_global_unlock(files_lglock);
+               lg_global_unlock(&files_lglock);
                mnt_drop_write(mnt);
                mntput(mnt);
                goto retry;
        } while_file_list_for_each_entry;
-       lg_global_unlock(files_lglock);
+       lg_global_unlock(&files_lglock);
 }
 
 void __init files_init(unsigned long mempages)
@@ -525,6 +524,6 @@ void __init files_init(unsigned long mempages)
        n = (mempages * (PAGE_SIZE / 1024)) / 10;
        files_stat.max_files = max_t(unsigned long, n, NR_FILE);
        files_defer_init();
-       lg_lock_init(files_lglock);
+       lg_lock_init(&files_lglock, "files_lglock");
        percpu_counter_init(&nr_files, 0);
 } 
index 504e61b7fd7515f8aafe7e3b9edd2c9fa42fd91d..9562109d3a879b3dab50ee27d989f3ae89c8b833 100644 (file)
@@ -962,7 +962,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (err)
                goto out;
 
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
 
        if (file->f_flags & O_DIRECT) {
                written = generic_file_direct_write(iocb, iov, &nr_segs,
index 56f6dcf307684287bad491b8711fa3a4ef4f0633..42678a33b7bb6297ced300f7fbb61696d37628c9 100644 (file)
@@ -627,12 +627,10 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
        return ERR_PTR(err);
 }
 
-static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
-                          int connectable)
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+                          struct inode *parent)
 {
-       struct inode *inode = dentry->d_inode;
-       bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
-       int len = encode_parent ? 6 : 3;
+       int len = parent ? 6 : 3;
        u64 nodeid;
        u32 generation;
 
@@ -648,14 +646,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        fh[1] = (u32)(nodeid & 0xffffffff);
        fh[2] = generation;
 
-       if (encode_parent) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                nodeid = get_fuse_inode(parent)->nodeid;
                generation = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
 
                fh[3] = (u32)(nodeid >> 32);
                fh[4] = (u32)(nodeid & 0xffffffff);
@@ -663,7 +656,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
        }
 
        *max_len = len;
-       return encode_parent ? 0x82 : 0x81;
+       return parent ? 0x82 : 0x81;
 }
 
 static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
index 70ba891654f8ce3582c456e208feda6d56e90a1a..e8ed6d4a6181132ff47960dc118cd6fb60c1b81c 100644 (file)
 #define GFS2_LARGE_FH_SIZE 8
 #define GFS2_OLD_FH_SIZE 10
 
-static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
-                         int connectable)
+static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
+                         struct inode *parent)
 {
        __be32 *fh = (__force __be32 *)p;
-       struct inode *inode = dentry->d_inode;
        struct super_block *sb = inode->i_sb;
        struct gfs2_inode *ip = GFS2_I(inode);
 
-       if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+       if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
                *len = GFS2_LARGE_FH_SIZE;
                return 255;
        } else if (*len < GFS2_SMALL_FH_SIZE) {
@@ -50,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
        fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
        *len = GFS2_SMALL_FH_SIZE;
 
-       if (!connectable || inode == sb->s_root->d_inode)
+       if (!parent || inode == sb->s_root->d_inode)
                return *len;
 
-       spin_lock(&dentry->d_lock);
-       inode = dentry->d_parent->d_inode;
-       ip = GFS2_I(inode);
-       igrab(inode);
-       spin_unlock(&dentry->d_lock);
+       ip = GFS2_I(parent);
 
        fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
        fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
@@ -65,8 +60,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
        fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
        *len = GFS2_LARGE_FH_SIZE;
 
-       iput(inode);
-
        return *len;
 }
 
index 7a5eb2c718c854206d6db419abe4ce7bc61d12c7..cdb84a8380682b5f341138cb6f75e2754434e073 100644 (file)
@@ -16,9 +16,9 @@
 static int chk_if_allocated(struct super_block *s, secno sec, char *msg)
 {
        struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
        if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail;
-       if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
+       if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
                hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec);
                goto fail1;
        }
@@ -62,7 +62,7 @@ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg)
 static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward)
 {
        struct quad_buffer_head qbh;
-       unsigned *bmp;
+       __le32 *bmp;
        unsigned bs = near & ~0x3fff;
        unsigned nr = (near & 0x3fff) & ~(n - 1);
        /*unsigned mnr;*/
@@ -236,7 +236,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
 int hpfs_alloc_if_possible(struct super_block *s, secno sec)
 {
        struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
        if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end;
        if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) {
                bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
@@ -254,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
 void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
 {
        struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
        struct hpfs_sb_info *sbi = hpfs_sb(s);
        /*printk("2 - ");*/
        if (!n) return;
@@ -299,7 +299,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n)
        int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14;
        int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff;
        int i, j;
-       u32 *bmp;
+       __le32 *bmp;
        struct quad_buffer_head qbh;
        if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
                for (j = 0; j < 512; j++) {
@@ -351,7 +351,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
                hpfs_free_sectors(s, dno, 4);
        } else {
                struct quad_buffer_head qbh;
-               u32 *bmp;
+               __le32 *bmp;
                unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4;
                if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
                        return;
index 08b503e8ed29ec610a098cb9658e1a2ecaa1779c..4bae4a4a60b1936eba70d17d18e7d4a016ed54b9 100644 (file)
@@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode,
        int c1, c2 = 0;
        go_down:
        if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1;
-       if (btree->internal) {
+       if (bp_internal(btree)) {
                for (i = 0; i < btree->n_used_nodes; i++)
                        if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) {
                                a = le32_to_cpu(btree->u.internal[i].down);
@@ -82,7 +82,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                brelse(bh);
                return -1;
        }
-       if (btree->internal) {
+       if (bp_internal(btree)) {
                a = le32_to_cpu(btree->u.internal[n].down);
                btree->u.internal[n].file_secno = cpu_to_le32(-1);
                mark_buffer_dirty(bh);
@@ -129,12 +129,12 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                }
                if (a == node && fnod) {
                        anode->up = cpu_to_le32(node);
-                       anode->btree.fnode_parent = 1;
+                       anode->btree.flags |= BP_fnode_parent;
                        anode->btree.n_used_nodes = btree->n_used_nodes;
                        anode->btree.first_free = btree->first_free;
                        anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes;
                        memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12);
-                       btree->internal = 1;
+                       btree->flags |= BP_internal;
                        btree->n_free_nodes = 11;
                        btree->n_used_nodes = 1;
                        btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree);
@@ -184,7 +184,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                        hpfs_free_sectors(s, ra, 1);
                        if ((anode = hpfs_map_anode(s, na, &bh))) {
                                anode->up = cpu_to_le32(up);
-                               anode->btree.fnode_parent = up == node && fnod;
+                               if (up == node && fnod)
+                                       anode->btree.flags |= BP_fnode_parent;
+                               else
+                                       anode->btree.flags &= ~BP_fnode_parent;
                                mark_buffer_dirty(bh);
                                brelse(bh);
                        }
@@ -198,7 +201,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) {
                        anode = new_anode;
                        /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/
-                       anode->btree.internal = 1;
+                       anode->btree.flags |= BP_internal;
                        anode->btree.n_used_nodes = 1;
                        anode->btree.n_free_nodes = 59;
                        anode->btree.first_free = cpu_to_le16(16);
@@ -215,7 +218,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
        }
        if ((anode = hpfs_map_anode(s, na, &bh))) {
                anode->up = cpu_to_le32(node);
-               if (fnod) anode->btree.fnode_parent = 1;
+               if (fnod)
+                       anode->btree.flags |= BP_fnode_parent;
                mark_buffer_dirty(bh);
                brelse(bh);
        }
@@ -234,18 +238,19 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
        }
        ranode->up = cpu_to_le32(node);
        memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free));
-       if (fnod) ranode->btree.fnode_parent = 1;
-       ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes;
-       if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
+       if (fnod)
+               ranode->btree.flags |= BP_fnode_parent;
+       ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes;
+       if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
                struct anode *unode;
                if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) {
                        unode->up = cpu_to_le32(ra);
-                       unode->btree.fnode_parent = 0;
+                       unode->btree.flags &= ~BP_fnode_parent;
                        mark_buffer_dirty(bh1);
                        brelse(bh1);
                }
        }
-       btree->internal = 1;
+       btree->flags |= BP_internal;
        btree->n_free_nodes = fnod ? 10 : 58;
        btree->n_used_nodes = 2;
        btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree);
@@ -278,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
        int d1, d2;
        go_down:
        d2 = 0;
-       while (btree1->internal) {
+       while (bp_internal(btree1)) {
                ano = le32_to_cpu(btree1->u.internal[pos].down);
                if (level) brelse(bh);
                if (hpfs_sb(s)->sb_chk)
@@ -412,13 +417,13 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs)
                        btree->n_free_nodes = 8;
                        btree->n_used_nodes = 0;
                        btree->first_free = cpu_to_le16(8);
-                       btree->internal = 0;
+                       btree->flags &= ~BP_internal;
                        mark_buffer_dirty(bh);
                } else hpfs_free_sectors(s, f, 1);
                brelse(bh);
                return;
        }
-       while (btree->internal) {
+       while (bp_internal(btree)) {
                nodes = btree->n_used_nodes + btree->n_free_nodes;
                for (i = 0; i < btree->n_used_nodes; i++)
                        if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f;
@@ -479,13 +484,13 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno)
        struct extended_attribute *ea;
        struct extended_attribute *ea_end;
        if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return;
-       if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree);
+       if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree);
        else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno));
        ea_end = fnode_end_ea(fnode);
        for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
-               if (ea->indirect)
-                       hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
-       hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l));
+               if (ea_indirect(ea))
+                       hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
+       hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l));
        brelse(bh);
        hpfs_free_sectors(s, fno, 1);
 }
index 2fa0089a02a8ec2934cda55cbbae18e50c34a4ea..b8472f803f4e54ea5039b85ac36cfdf33a48925b 100644 (file)
@@ -87,7 +87,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                        ret = -EIOERROR;
                        goto out;
                }
-               if (!fno->dirflag) {
+               if (!fnode_is_dir(fno)) {
                        e = 1;
                        hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
                                        (unsigned long)inode->i_ino);
index 1e0e2ac30fd3be93f8e5b7a97618f19a52220ec4..3228c524ebe56f948d8896cec23ca6b1284f6303 100644 (file)
@@ -153,7 +153,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
                }
                de->length = cpu_to_le16(36);
                de->down = 1;
-               *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr);
+               *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr);
        }
 }
 
@@ -177,7 +177,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
        memmove((char *)de + d_size, de, (char *)de_end - (char *)de);
        memset(de, 0, d_size);
        if (down_ptr) {
-               *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
+               *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
                de->down = 1;
        }
        de->length = cpu_to_le16(d_size);
@@ -656,7 +656,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
                                del->down = 0;
                                d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4);
                        } else if (down)
-                               *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
+                               *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
                } else goto endm;
                if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
                        printk("HPFS: out of memory for dtree balancing\n");
@@ -672,7 +672,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
                        de_prev->down = 1;
                        dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4);
                }
-               *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
+               *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
                hpfs_mark_4buffers_dirty(&qbh);
                hpfs_brelse4(&qbh);
                for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4);
@@ -1015,7 +1015,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
                kfree(name2);
                return NULL;
        }       
-       if (!upf->dirflag) {
+       if (!fnode_is_dir(upf)) {
                brelse(bh);
                hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up));
                kfree(name2);
index d8b84d113c891bbcfd8416d3f35153983b0549a7..bcaafcd2666ac275d02c2f054023cc537ebd7644 100644 (file)
@@ -23,15 +23,15 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len)
                        return;
                }
                if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
-               if (ea->indirect) {
+               if (ea_indirect(ea)) {
                        if (ea_valuelen(ea) != 8) {
-                               hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x",
+                               hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x",
                                        ano ? "anode" : "sectors", a, pos);
                                return;
                        }
                        if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4))
                                return;
-                       hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
+                       hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
                }
                pos += ea->namelen + ea_valuelen(ea) + 5;
        }
@@ -81,7 +81,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
        struct extended_attribute *ea_end = fnode_end_ea(fnode);
        for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
+                       if (ea_indirect(ea))
                                goto indirect;
                        if (ea_valuelen(ea) >= size)
                                return -EINVAL;
@@ -91,7 +91,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
                }
        a = le32_to_cpu(fnode->ea_secno);
        len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
        pos = 0;
        while (pos < len) {
                ea = (struct extended_attribute *)ex;
@@ -101,10 +101,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
                        return -EIO;
                }
                if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                        return -EIO;
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
+                       if (ea_indirect(ea))
                                goto indirect;
                        if (ea_valuelen(ea) >= size)
                                return -EINVAL;
@@ -119,7 +119,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
 indirect:
        if (ea_len(ea) >= size)
                return -EINVAL;
-       if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf))
+       if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf))
                return -EIO;
        buf[ea_len(ea)] = 0;
        return 0;
@@ -136,8 +136,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
        struct extended_attribute *ea_end = fnode_end_ea(fnode);
        for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
-                               return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+                       if (ea_indirect(ea))
+                               return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
                        if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
                                printk("HPFS: out of memory for EA\n");
                                return NULL;
@@ -148,7 +148,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
                }
        a = le32_to_cpu(fnode->ea_secno);
        len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
        pos = 0;
        while (pos < len) {
                char ex[4 + 255 + 1 + 8];
@@ -159,11 +159,11 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
                        return NULL;
                }
                if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                        return NULL;
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
-                               return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+                       if (ea_indirect(ea))
+                               return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
                        if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
                                printk("HPFS: out of memory for EA\n");
                                return NULL;
@@ -199,9 +199,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
        struct extended_attribute *ea_end = fnode_end_ea(fnode);
        for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect) {
+                       if (ea_indirect(ea)) {
                                if (ea_len(ea) == size)
-                                       set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+                                       set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
                        } else if (ea_valuelen(ea) == size) {
                                memcpy(ea_data(ea), data, size);
                        }
@@ -209,7 +209,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                }
        a = le32_to_cpu(fnode->ea_secno);
        len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
        pos = 0;
        while (pos < len) {
                char ex[4 + 255 + 1 + 8];
@@ -220,12 +220,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                        return;
                }
                if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                        return;
                if (!strcmp(ea->name, key)) {
-                       if (ea->indirect) {
+                       if (ea_indirect(ea)) {
                                if (ea_len(ea) == size)
-                                       set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+                                       set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
                        }
                        else {
                                if (ea_valuelen(ea) == size)
@@ -246,7 +246,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
        if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) {
                hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
                        (unsigned long)inode->i_ino,
-                       le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
+                       le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
                return;
        }
        if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) &&
@@ -276,7 +276,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s));
                fnode->ea_size_s = cpu_to_le16(0);
                fnode->ea_secno = cpu_to_le32(n);
-               fnode->ea_anode = cpu_to_le32(0);
+               fnode->flags &= ~FNODE_anode;
                mark_buffer_dirty(bh);
                brelse(bh);
        }
@@ -288,9 +288,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                        secno q = hpfs_alloc_sector(s, fno, 1, 0);
                        if (!q) goto bail;
                        fnode->ea_secno = cpu_to_le32(q);
-                       fnode->ea_anode = 0;
+                       fnode->flags &= ~FNODE_anode;
                        len++;
-               } else if (!fnode->ea_anode) {
+               } else if (!fnode_in_anode(fnode)) {
                        if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) {
                                len++;
                        } else {
@@ -310,7 +310,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                                anode->u.external[0].length = cpu_to_le32(len);
                                mark_buffer_dirty(bh);
                                brelse(bh);
-                               fnode->ea_anode = 1;
+                               fnode->flags |= FNODE_anode;
                                fnode->ea_secno = cpu_to_le32(a_s);*/
                                secno new_sec;
                                int i;
@@ -338,7 +338,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                                len = (pos + 511) >> 9;
                        }
                }
-               if (fnode->ea_anode) {
+               if (fnode_in_anode(fnode)) {
                        if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno),
                                                     0, len) != -1) {
                                len++;
@@ -351,16 +351,16 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
        h[1] = strlen(key);
        h[2] = size & 0xff;
        h[3] = size >> 8;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
        fnode->ea_size_l = cpu_to_le32(pos);
        ret:
        hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size;
        return;
        bail:
        if (le32_to_cpu(fnode->ea_secno))
-               if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
+               if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
                else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9));
        else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0);
 }
index 8b0650aae32812bac9abbb581439b592d64949d2..cce025aff1b19b86f824cd47bfb6c9457d583068 100644 (file)
@@ -51,11 +51,11 @@ struct hpfs_boot_block
   u8 n_rootdir_entries[2];
   u8 n_sectors_s[2];
   u8 media_byte;
-  u16 sectors_per_fat;
-  u16 sectors_per_track;
-  u16 heads_per_cyl;
-  u32 n_hidden_sectors;
-  u32 n_sectors_l;             /* size of partition */
+  __le16 sectors_per_fat;
+  __le16 sectors_per_track;
+  __le16 heads_per_cyl;
+  __le32 n_hidden_sectors;
+  __le32 n_sectors_l;          /* size of partition */
   u8 drive_number;
   u8 mbz;
   u8 sig_28h;                  /* 28h */
@@ -63,7 +63,7 @@ struct hpfs_boot_block
   u8 vol_label[11];
   u8 sig_hpfs[8];              /* "HPFS    " */
   u8 pad[448];
-  u16 magic;                   /* aa55 */
+  __le16 magic;                        /* aa55 */
 };
 
 
@@ -75,28 +75,28 @@ struct hpfs_boot_block
 
 struct hpfs_super_block
 {
-  u32 magic;                           /* f995 e849 */
-  u32 magic1;                          /* fa53 e9c5, more magic? */
+  __le32 magic;                                /* f995 e849 */
+  __le32 magic1;                       /* fa53 e9c5, more magic? */
   u8 version;                          /* version of a filesystem  usually 2 */
   u8 funcversion;                      /* functional version - oldest version
                                           of filesystem that can understand
                                           this disk */
-  u16 zero;                            /* 0 */
-  fnode_secno root;                    /* fnode of root directory */
-  secno n_sectors;                     /* size of filesystem */
-  u32 n_badblocks;                     /* number of bad blocks */
-  secno bitmaps;                       /* pointers to free space bit maps */
-  u32 zero1;                           /* 0 */
-  secno badblocks;                     /* bad block list */
-  u32 zero3;                           /* 0 */
-  time32_t last_chkdsk;                        /* date last checked, 0 if never */
-  time32_t last_optimize;              /* date last optimized, 0 if never */
-  secno n_dir_band;                    /* number of sectors in dir band */
-  secno dir_band_start;                        /* first sector in dir band */
-  secno dir_band_end;                  /* last sector in dir band */
-  secno dir_band_bitmap;               /* free space map, 1 dnode per bit */
+  __le16 zero;                         /* 0 */
+  __le32 root;                         /* fnode of root directory */
+  __le32 n_sectors;                    /* size of filesystem */
+  __le32 n_badblocks;                  /* number of bad blocks */
+  __le32 bitmaps;                      /* pointers to free space bit maps */
+  __le32 zero1;                                /* 0 */
+  __le32 badblocks;                    /* bad block list */
+  __le32 zero3;                                /* 0 */
+  __le32 last_chkdsk;                  /* date last checked, 0 if never */
+  __le32 last_optimize;                        /* date last optimized, 0 if never */
+  __le32 n_dir_band;                   /* number of sectors in dir band */
+  __le32 dir_band_start;                       /* first sector in dir band */
+  __le32 dir_band_end;                 /* last sector in dir band */
+  __le32 dir_band_bitmap;              /* free space map, 1 dnode per bit */
   u8 volume_name[32];                  /* not used */
-  secno user_id_table;                 /* 8 preallocated sectors - user id */
+  __le32 user_id_table;                        /* 8 preallocated sectors - user id */
   u32 zero6[103];                      /* 0 */
 };
 
@@ -109,8 +109,8 @@ struct hpfs_super_block
 
 struct hpfs_spare_block
 {
-  u32 magic;                           /* f991 1849 */
-  u32 magic1;                          /* fa52 29c5, more magic? */
+  __le32 magic;                                /* f991 1849 */
+  __le32 magic1;                               /* fa52 29c5, more magic? */
 
 #ifdef __LITTLE_ENDIAN
   u8 dirty: 1;                         /* 0 clean, 1 "improperly stopped" */
@@ -153,21 +153,21 @@ struct hpfs_spare_block
   u8 mm_contlgulty;
   u8 unused;
 
-  secno hotfix_map;                    /* info about remapped bad sectors */
-  u32 n_spares_used;                   /* number of hotfixes */
-  u32 n_spares;                                /* number of spares in hotfix map */
-  u32 n_dnode_spares_free;             /* spare dnodes unused */
-  u32 n_dnode_spares;                  /* length of spare_dnodes[] list,
+  __le32 hotfix_map;                   /* info about remapped bad sectors */
+  __le32 n_spares_used;                        /* number of hotfixes */
+  __le32 n_spares;                     /* number of spares in hotfix map */
+  __le32 n_dnode_spares_free;          /* spare dnodes unused */
+  __le32 n_dnode_spares;               /* length of spare_dnodes[] list,
                                           follows in this block*/
-  secno code_page_dir;                 /* code page directory block */
-  u32 n_code_pages;                    /* number of code pages */
-  u32 super_crc;                       /* on HPFS386 and LAN Server this is
+  __le32 code_page_dir;                        /* code page directory block */
+  __le32 n_code_pages;                 /* number of code pages */
+  __le32 super_crc;                    /* on HPFS386 and LAN Server this is
                                           checksum of superblock, on normal
                                           OS/2 unused */
-  u32 spare_crc;                       /* on HPFS386 checksum of spareblock */
-  u32 zero1[15];                       /* unused */
-  dnode_secno spare_dnodes[100];       /* emergency free dnode list */
-  u32 zero2[1];                                /* room for more? */
+  __le32 spare_crc;                    /* on HPFS386 checksum of spareblock */
+  __le32 zero1[15];                    /* unused */
+  __le32 spare_dnodes[100];            /* emergency free dnode list */
+  __le32 zero2[1];                     /* room for more? */
 };
 
 /* The bad block list is 4 sectors long.  The first word must be zero,
@@ -202,18 +202,18 @@ struct hpfs_spare_block
 
 struct code_page_directory
 {
-  u32 magic;                           /* 4945 21f7 */
-  u32 n_code_pages;                    /* number of pointers following */
-  u32 zero1[2];
+  __le32 magic;                                /* 4945 21f7 */
+  __le32 n_code_pages;                 /* number of pointers following */
+  __le32 zero1[2];
   struct {
-    u16 ix;                            /* index */
-    u16 code_page_number;              /* code page number */
-    u32 bounds;                                /* matches corresponding word
+    __le16 ix;                         /* index */
+    __le16 code_page_number;           /* code page number */
+    __le32 bounds;                     /* matches corresponding word
                                           in data block */
-    secno code_page_data;              /* sector number of a code_page_data
+    __le32 code_page_data;             /* sector number of a code_page_data
                                           containing c.p. array */
-    u16 index;                         /* index in c.p. array in that sector*/
-    u16 unknown;                       /* some unknown value; usually 0;
+    __le16 index;                      /* index in c.p. array in that sector*/
+    __le16 unknown;                    /* some unknown value; usually 0;
                                           2 in Japanese version */
   } array[31];                         /* unknown length */
 };
@@ -224,19 +224,19 @@ struct code_page_directory
 
 struct code_page_data
 {
-  u32 magic;                           /* 8945 21f7 */
-  u32 n_used;                          /* # elements used in c_p_data[] */
-  u32 bounds[3];                       /* looks a bit like
+  __le32 magic;                                /* 8945 21f7 */
+  __le32 n_used;                       /* # elements used in c_p_data[] */
+  __le32 bounds[3];                    /* looks a bit like
                                             (beg1,end1), (beg2,end2)
                                           one byte each */
-  u16 offs[3];                         /* offsets from start of sector
+  __le16 offs[3];                      /* offsets from start of sector
                                           to start of c_p_data[ix] */
   struct {
-    u16 ix;                            /* index */
-    u16 code_page_number;              /* code page number */
-    u16 unknown;                       /* the same as in cp directory */
+    __le16 ix;                         /* index */
+    __le16 code_page_number;           /* code page number */
+    __le16 unknown;                    /* the same as in cp directory */
     u8 map[128];                       /* upcase table for chars 80..ff */
-    u16 zero2;
+    __le16 zero2;
   } code_page[3];
   u8 incognita[78];
 };
@@ -278,8 +278,8 @@ struct code_page_data
 #define DNODE_MAGIC   0x77e40aae
 
 struct dnode {
-  u32 magic;                           /* 77e4 0aae */
-  u32 first_free;                      /* offset from start of dnode to
+  __le32 magic;                                /* 77e4 0aae */
+  __le32 first_free;                   /* offset from start of dnode to
                                           first free dir entry */
 #ifdef __LITTLE_ENDIAN
   u8 root_dnode: 1;                    /* Is it root dnode? */
@@ -293,14 +293,14 @@ struct dnode {
   u8 root_dnode: 1;                    /* Is it root dnode? */
 #endif
   u8 increment_me2[3];
-  secno up;                            /* (root dnode) directory's fnode
+  __le32 up;                           /* (root dnode) directory's fnode
                                           (nonroot) parent dnode */
-  dnode_secno self;                    /* pointer to this dnode */
+  __le32 self;                 /* pointer to this dnode */
   u8 dirent[2028];                     /* one or more dirents */
 };
 
 struct hpfs_dirent {
-  u16 length;                          /* offset to next dirent */
+  __le16 length;                       /* offset to next dirent */
 
 #ifdef __LITTLE_ENDIAN
   u8 first: 1;                         /* set on phony ^A^A (".") entry */
@@ -346,12 +346,12 @@ struct hpfs_dirent {
   u8 read_only: 1;                     /* dos attrib */
 #endif
 
-  fnode_secno fnode;                   /* fnode giving allocation info */
-  time32_t write_date;                 /* mtime */
-  u32 file_size;                       /* file length, bytes */
-  time32_t read_date;                  /* atime */
-  time32_t creation_date;                      /* ctime */
-  u32 ea_size;                         /* total EA length, bytes */
+  __le32 fnode;                                /* fnode giving allocation info */
+  __le32 write_date;                   /* mtime */
+  __le32 file_size;                    /* file length, bytes */
+  __le32 read_date;                    /* atime */
+  __le32 creation_date;                        /* ctime */
+  __le32 ea_size;                      /* total EA length, bytes */
   u8 no_of_acls;                       /* number of ACL's (low 3 bits) */
   u8 ix;                               /* code page index (of filename), see
                                           struct code_page_data */
@@ -375,50 +375,36 @@ struct hpfs_dirent {
 
 struct bplus_leaf_node
 {
-  u32 file_secno;                      /* first file sector in extent */
-  u32 length;                          /* length, sectors */
-  secno disk_secno;                    /* first corresponding disk sector */
+  __le32 file_secno;                   /* first file sector in extent */
+  __le32 length;                       /* length, sectors */
+  __le32 disk_secno;                   /* first corresponding disk sector */
 };
 
 struct bplus_internal_node
 {
-  u32 file_secno;                      /* subtree maps sectors < this  */
-  anode_secno down;                    /* pointer to subtree */
+  __le32 file_secno;                   /* subtree maps sectors < this  */
+  __le32 down;                         /* pointer to subtree */
 };
 
+enum {
+       BP_hbff = 1,
+       BP_fnode_parent = 0x20,
+       BP_binary_search = 0x40,
+       BP_internal = 0x80
+};
 struct bplus_header
 {
-#ifdef __LITTLE_ENDIAN
-  u8 hbff: 1;                  /* high bit of first free entry offset */
-  u8 flag1234: 4;
-  u8 fnode_parent: 1;                  /* ? we're pointed to by an fnode,
-                                          the data btree or some ea or the
-                                          main ea bootage pointer ea_secno */
-                                       /* also can get set in fnodes, which
-                                          may be a chkdsk glitch or may mean
-                                          this bit is irrelevant in fnodes,
-                                          or this interpretation is all wet */
-  u8 binary_search: 1;                 /* suggest binary search (unused) */
-  u8 internal: 1;                      /* 1 -> (internal) tree of anodes
-                                          0 -> (leaf) list of extents */
-#else
-  u8 internal: 1;                      /* 1 -> (internal) tree of anodes
-                                          0 -> (leaf) list of extents */
-  u8 binary_search: 1;                 /* suggest binary search (unused) */
-  u8 fnode_parent: 1;                  /* ? we're pointed to by an fnode,
+  u8 flags;                            /* bit 0 - high bit of first free entry offset
+                                          bit 5 - we're pointed to by an fnode,
                                           the data btree or some ea or the
-                                          main ea bootage pointer ea_secno */
-                                       /* also can get set in fnodes, which
-                                          may be a chkdsk glitch or may mean
-                                          this bit is irrelevant in fnodes,
-                                          or this interpretation is all wet */
-  u8 flag1234: 4;
-  u8 hbff: 1;                  /* high bit of first free entry offset */
-#endif
+                                          main ea bootage pointer ea_secno
+                                          bit 6 - suggest binary search (unused)
+                                          bit 7 - 1 -> (internal) tree of anodes
+                                                  0 -> (leaf) list of extents */
   u8 fill[3];
   u8 n_free_nodes;                     /* free nodes in following array */
   u8 n_used_nodes;                     /* used nodes in following array */
-  u16 first_free;                      /* offset from start of header to
+  __le16 first_free;                   /* offset from start of header to
                                           first free node in array */
   union {
     struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
@@ -428,6 +414,16 @@ struct bplus_header
   } u;
 };
 
+static inline bool bp_internal(struct bplus_header *bp)
+{
+       return bp->flags & BP_internal;
+}
+
+static inline bool bp_fnode_parent(struct bplus_header *bp)
+{
+       return bp->flags & BP_fnode_parent;
+}
+
 /* fnode: root of allocation b+ tree, and EA's */
 
 /* Every file and every directory has one fnode, pointed to by the directory
@@ -436,62 +432,56 @@ struct bplus_header
 
 #define FNODE_MAGIC 0xf7e40aae
 
+enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)};
 struct fnode
 {
-  u32 magic;                           /* f7e4 0aae */
-  u32 zero1[2];                                /* read history */
+  __le32 magic;                                /* f7e4 0aae */
+  __le32 zero1[2];                     /* read history */
   u8 len, name[15];                    /* true length, truncated name */
-  fnode_secno up;                      /* pointer to file's directory fnode */
-  secno acl_size_l;
-  secno acl_secno;
-  u16 acl_size_s;
+  __le32 up;                           /* pointer to file's directory fnode */
+  __le32 acl_size_l;
+  __le32 acl_secno;
+  __le16 acl_size_s;
   u8 acl_anode;
   u8 zero2;                            /* history bit count */
-  u32 ea_size_l;                       /* length of disk-resident ea's */
-  secno ea_secno;                      /* first sector of disk-resident ea's*/
-  u16 ea_size_s;                       /* length of fnode-resident ea's */
-
-#ifdef __LITTLE_ENDIAN
-  u8 flag0: 1;
-  u8 ea_anode: 1;                      /* 1 -> ea_secno is an anode */
-  u8 flag234567: 6;
-#else
-  u8 flag234567: 6;
-  u8 ea_anode: 1;                      /* 1 -> ea_secno is an anode */
-  u8 flag0: 1;
-#endif
+  __le32 ea_size_l;                    /* length of disk-resident ea's */
+  __le32 ea_secno;                     /* first sector of disk-resident ea's*/
+  __le16 ea_size_s;                    /* length of fnode-resident ea's */
 
-#ifdef __LITTLE_ENDIAN
-  u8 dirflag: 1;                       /* 1 -> directory.  first & only extent
-                                          points to dnode. */
-  u8 flag9012345: 7;
-#else
-  u8 flag9012345: 7;
-  u8 dirflag: 1;                       /* 1 -> directory.  first & only extent
+  __le16 flags;                                /* bit 1 set -> ea_secno is an anode */
+                                       /* bit 8 set -> directory.  first & only extent
                                           points to dnode. */
-#endif
-
   struct bplus_header btree;           /* b+ tree, 8 extents or 12 subtrees */
   union {
     struct bplus_leaf_node external[8];
     struct bplus_internal_node internal[12];
   } u;
 
-  u32 file_size;                       /* file length, bytes */
-  u32 n_needea;                                /* number of EA's with NEEDEA set */
+  __le32 file_size;                    /* file length, bytes */
+  __le32 n_needea;                     /* number of EA's with NEEDEA set */
   u8 user_id[16];                      /* unused */
-  u16 ea_offs;                         /* offset from start of fnode
+  __le16 ea_offs;                      /* offset from start of fnode
                                           to first fnode-resident ea */
   u8 dasd_limit_treshhold;
   u8 dasd_limit_delta;
-  u32 dasd_limit;
-  u32 dasd_usage;
+  __le32 dasd_limit;
+  __le32 dasd_usage;
   u8 ea[316];                          /* zero or more EA's, packed together
                                           with no alignment padding.
                                           (Do not use this name, get here
                                           via fnode + ea_offs. I think.) */
 };
 
+static inline bool fnode_in_anode(struct fnode *p)
+{
+       return (p->flags & FNODE_anode) != 0;
+}
+
+static inline bool fnode_is_dir(struct fnode *p)
+{
+       return (p->flags & FNODE_dir) != 0;
+}
+
 
 /* anode: 99.44% pure allocation tree */
 
@@ -499,9 +489,9 @@ struct fnode
 
 struct anode
 {
-  u32 magic;                           /* 37e4 0aae */
-  anode_secno self;                    /* pointer to this anode */
-  secno up;                            /* parent anode or fnode */
+  __le32 magic;                                /* 37e4 0aae */
+  __le32 self;                         /* pointer to this anode */
+  __le32 up;                           /* parent anode or fnode */
 
   struct bplus_header btree;           /* b+tree, 40 extents or 60 subtrees */
   union {
@@ -509,7 +499,7 @@ struct anode
     struct bplus_internal_node internal[60];
   } u;
 
-  u32 fill[3];                         /* unused */
+  __le32 fill[3];                      /* unused */
 };
 
 
@@ -528,32 +518,23 @@ struct anode
    run, or in multiple runs.  Flags in the fnode tell whether the EA list
    is immediate, in a single run, or in multiple runs. */
 
+enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 };
 struct extended_attribute
 {
-#ifdef __LITTLE_ENDIAN
-  u8 indirect: 1;                      /* 1 -> value gives sector number
+  u8 flags;                            /* bit 0 set -> value gives sector number
                                           where real value starts */
-  u8 anode: 1;                         /* 1 -> sector is an anode
+                                       /* bit 1 set -> sector is an anode
                                           that points to fragmented value */
-  u8 flag23456: 5;
-  u8 needea: 1;                                /* required ea */
-#else
-  u8 needea: 1;                                /* required ea */
-  u8 flag23456: 5;
-  u8 anode: 1;                         /* 1 -> sector is an anode
-                                          that points to fragmented value */
-  u8 indirect: 1;                      /* 1 -> value gives sector number
-                                          where real value starts */
-#endif
+                                       /* bit 7 set -> required ea */
   u8 namelen;                          /* length of name, bytes */
   u8 valuelen_lo;                      /* length of value, bytes */
   u8 valuelen_hi;                      /* length of value, bytes */
-  u8 name[0];
+  u8 name[];
   /*
     u8 name[namelen];                  ascii attrib name
     u8 nul;                            terminating '\0', not counted
     u8 value[valuelen];                        value, arbitrary
-      if this.indirect, valuelen is 8 and the value is
+      if this.flags & 1, valuelen is 8 and the value is
         u32 length;                    real length of value, bytes
         secno secno;                   sector address where it starts
       if this.anode, the above sector number is the root of an anode tree
@@ -561,6 +542,16 @@ struct extended_attribute
   */
 };
 
+static inline bool ea_indirect(struct extended_attribute *ea)
+{
+       return ea->flags & EA_indirect;
+}
+
+static inline bool ea_in_anode(struct extended_attribute *ea)
+{
+       return ea->flags & EA_anode;
+}
+
 /*
    Local Variables:
    comment-column: 40
index 6d2d5008fa435f22dcb876828e8b23984d3044b5..c07ef1f1ced60a0cf295772a218575d9c78e58d1 100644 (file)
@@ -75,7 +75,7 @@ struct hpfs_sb_info {
        unsigned char *sb_cp_table;     /* code page tables: */
                                        /*      128 bytes uppercasing table & */
                                        /*      128 bytes lowercasing table */
-       unsigned *sb_bmp_dir;           /* main bitmap directory */
+       __le32 *sb_bmp_dir;             /* main bitmap directory */
        unsigned sb_c_bitmap;           /* current bitmap */
        unsigned sb_max_fwd_alloc;      /* max forwad allocation */
        int sb_timeshift;
@@ -93,7 +93,7 @@ struct quad_buffer_head {
 static inline dnode_secno de_down_pointer (struct hpfs_dirent *de)
 {
   CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n"));
-  return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4));
+  return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4));
 }
 
 /* The first dir entry in a dnode */
@@ -141,12 +141,12 @@ static inline struct extended_attribute *next_ea(struct extended_attribute *ea)
 
 static inline secno ea_sec(struct extended_attribute *ea)
 {
-       return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen)));
+       return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen)));
 }
 
 static inline secno ea_len(struct extended_attribute *ea)
 {
-       return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen)));
+       return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen)));
 }
 
 static inline char *ea_data(struct extended_attribute *ea)
@@ -171,7 +171,7 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src)
        dst->not_8x3 = n;
 }
 
-static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n)
+static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n)
 {
        int i;
        if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n;
@@ -268,10 +268,10 @@ void hpfs_evict_inode(struct inode *);
 
 /* map.c */
 
-unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
-unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
+__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
 unsigned char *hpfs_load_code_page(struct super_block *, secno);
-secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
+__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
 struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
 struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
 struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
index b43066cbdc6a7cd201538cdc86e28fd9d22deac8..ed671e0ea78443b35bb6d1dd3eabc64bd7559c1f 100644 (file)
@@ -110,7 +110,7 @@ void hpfs_read_inode(struct inode *i)
                        }
                }
        }
-       if (fnode->dirflag) {
+       if (fnode_is_dir(fnode)) {
                int n_dnodes, n_subdirs;
                i->i_mode |= S_IFDIR;
                i->i_op = &hpfs_dir_iops;
index a790821366a7f045d068fe47df517dc479b0ecce..4acb19d78359d4bec83f90b854680dc3962905cf 100644 (file)
@@ -8,12 +8,12 @@
 
 #include "hpfs_fn.h"
 
-unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
+__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
 {
        return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0);
 }
 
-unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
+__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
                         struct quad_buffer_head *qbh, char *id)
 {
        secno sec;
@@ -89,18 +89,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
        return cp_table;
 }
 
-secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
+__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
 {
        struct buffer_head *bh;
        int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21;
        int i;
-       secno *b;
+       __le32 *b;
        if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
                printk("HPFS: can't allocate memory for bitmap directory\n");
                return NULL;
        }       
        for (i=0;i<n;i++) {
-               secno *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
+               __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
                if (!d) {
                        kfree(b);
                        return NULL;
@@ -130,16 +130,16 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea
                                        (unsigned long)ino);
                                goto bail;
                        }
-                       if (!fnode->dirflag) {
+                       if (!fnode_is_dir(fnode)) {
                                if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
-                                   (fnode->btree.internal ? 12 : 8)) {
+                                   (bp_internal(&fnode->btree) ? 12 : 8)) {
                                        hpfs_error(s,
                                           "bad number of nodes in fnode %08lx",
                                            (unsigned long)ino);
                                        goto bail;
                                }
                                if (le16_to_cpu(fnode->btree.first_free) !=
-                                   8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) {
+                                   8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) {
                                        hpfs_error(s,
                                            "bad first_free pointer in fnode %08lx",
                                            (unsigned long)ino);
@@ -187,12 +187,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff
                                goto bail;
                        }
                        if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes !=
-                           (anode->btree.internal ? 60 : 40)) {
+                           (bp_internal(&anode->btree) ? 60 : 40)) {
                                hpfs_error(s, "bad number of nodes in anode %08x", ano);
                                goto bail;
                        }
                        if (le16_to_cpu(anode->btree.first_free) !=
-                           8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) {
+                           8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) {
                                hpfs_error(s, "bad first_free pointer in anode %08x", ano);
                                goto bail;
                        }
index 30dd7b10b507a077877d58a2bb4d5ada18ee3101..9083ef8af58c162f7fd207f7ef37263b1f35de4f 100644 (file)
@@ -70,7 +70,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        fnode->len = len;
        memcpy(fnode->name, name, len > 15 ? 15 : len);
        fnode->up = cpu_to_le32(dir->i_ino);
-       fnode->dirflag = 1;
+       fnode->flags |= FNODE_dir;
        fnode->btree.n_free_nodes = 7;
        fnode->btree.n_used_nodes = 1;
        fnode->btree.first_free = cpu_to_le16(0x14);
index 54f6eccb79d9ed8c67f7ada5a96867ad4c61b37c..706a12c083ea726a7a268d647ae266b02a3a2ca7 100644 (file)
@@ -572,7 +572,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                mark_buffer_dirty(bh2);
        }
 
-       if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) {
+       if (spareblock->hotfixes_used || spareblock->n_spares_used) {
                if (errs >= 2) {
                        printk("HPFS: Hotfixes not supported here, try chkdsk\n");
                        mark_dirty(s, 0);
@@ -645,7 +645,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                root->i_mtime.tv_nsec = 0;
                root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
                root->i_ctime.tv_nsec = 0;
-               hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size);
+               hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
                hpfs_i(root)->i_parent_dir = root->i_ino;
                if (root->i_size == -1)
                        root->i_size = 2048;
index c474c1d7062bcdf0aa32d86033f4f98110fb662e..c99163b1b31036ef68974c0c5dbc192f8f73f4da 100644 (file)
@@ -1487,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
        return 0;
 }
 
+/*
+ * This does the actual work of updating an inode's time or version.  The
+ * caller must have called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+       if (inode->i_op->update_time)
+               return inode->i_op->update_time(inode, time, flags);
+
+       if (flags & S_ATIME)
+               inode->i_atime = *time;
+       if (flags & S_VERSION)
+               inode_inc_iversion(inode);
+       if (flags & S_CTIME)
+               inode->i_ctime = *time;
+       if (flags & S_MTIME)
+               inode->i_mtime = *time;
+       mark_inode_dirty_sync(inode);
+       return 0;
+}
+
 /**
  *     touch_atime     -       update the access time
- *     @mnt: mount the inode is accessed on
- *     @dentry: dentry accessed
+ *     @path: the &struct path to update
  *
  *     Update the accessed time on an inode and mark it for writeback.
  *     This function automatically handles read only file systems and media,
@@ -1525,12 +1545,83 @@ void touch_atime(struct path *path)
        if (mnt_want_write(mnt))
                return;
 
-       inode->i_atime = now;
-       mark_inode_dirty_sync(inode);
+       /*
+        * File systems can error out when updating inodes if they need to
+        * allocate new space to modify an inode (such is the case for
+        * Btrfs), but since we touch atime while walking down the path we
+        * really don't care if we failed to update the atime of the file,
+        * so just ignore the return value.
+        */
+       update_time(inode, &now, S_ATIME);
        mnt_drop_write(mnt);
 }
 EXPORT_SYMBOL(touch_atime);
 
+/*
+ * The logic we want is
+ *
+ *     if suid or (sgid and xgrp)
+ *             remove privs
+ */
+int should_remove_suid(struct dentry *dentry)
+{
+       umode_t mode = dentry->d_inode->i_mode;
+       int kill = 0;
+
+       /* suid always must be killed */
+       if (unlikely(mode & S_ISUID))
+               kill = ATTR_KILL_SUID;
+
+       /*
+        * sgid without any exec bits is just a mandatory locking mark; leave
+        * it alone.  If some exec bits are set, it's a real sgid; kill it.
+        */
+       if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+               kill |= ATTR_KILL_SGID;
+
+       if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+               return kill;
+
+       return 0;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+static int __remove_suid(struct dentry *dentry, int kill)
+{
+       struct iattr newattrs;
+
+       newattrs.ia_valid = ATTR_FORCE | kill;
+       return notify_change(dentry, &newattrs);
+}
+
+int file_remove_suid(struct file *file)
+{
+       struct dentry *dentry = file->f_path.dentry;
+       struct inode *inode = dentry->d_inode;
+       int killsuid;
+       int killpriv;
+       int error = 0;
+
+       /* Fast path for nothing security related */
+       if (IS_NOSEC(inode))
+               return 0;
+
+       killsuid = should_remove_suid(dentry);
+       killpriv = security_inode_need_killpriv(dentry);
+
+       if (killpriv < 0)
+               return killpriv;
+       if (killpriv)
+               error = security_inode_killpriv(dentry);
+       if (!error && killsuid)
+               error = __remove_suid(dentry, killsuid);
+       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+               inode->i_flags |= S_NOSEC;
+
+       return error;
+}
+EXPORT_SYMBOL(file_remove_suid);
+
 /**
  *     file_update_time        -       update mtime and ctime time
  *     @file: file accessed
@@ -1540,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime);
  *     usage in the file write path of filesystems, and filesystems may
  *     choose to explicitly ignore update via this function with the
  *     S_NOCMTIME inode flag, e.g. for network filesystem where these
- *     timestamps are handled by the server.
+ *     timestamps are handled by the server.  This can return an error for
+ *     file systems that need to allocate space in order to update an inode.
  */
 
-void file_update_time(struct file *file)
+int file_update_time(struct file *file)
 {
        struct inode *inode = file->f_path.dentry->d_inode;
        struct timespec now;
-       enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+       int sync_it = 0;
+       int ret;
 
        /* First try to exhaust all avenues to not sync */
        if (IS_NOCMTIME(inode))
-               return;
+               return 0;
 
        now = current_fs_time(inode->i_sb);
        if (!timespec_equal(&inode->i_mtime, &now))
@@ -1564,21 +1657,16 @@ void file_update_time(struct file *file)
                sync_it |= S_VERSION;
 
        if (!sync_it)
-               return;
+               return 0;
 
        /* Finally allowed to write? Takes lock. */
        if (mnt_want_write_file(file))
-               return;
+               return 0;
 
-       /* Only change inode inside the lock region */
-       if (sync_it & S_VERSION)
-               inode_inc_iversion(inode);
-       if (sync_it & S_CTIME)
-               inode->i_ctime = now;
-       if (sync_it & S_MTIME)
-               inode->i_mtime = now;
-       mark_inode_dirty_sync(inode);
+       ret = update_time(inode, &now, sync_it);
        mnt_drop_write_file(file);
+
+       return ret;
 }
 EXPORT_SYMBOL(file_update_time);
 
index 9962c59ba280b1c75d78adc55b8491733075a5e0..18bc216ea09d95ecff126ef96987ff786b5cbcb1 100644 (file)
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
 
 extern void __init mnt_init(void);
 
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
 
 
 /*
@@ -100,6 +100,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
 
 extern long do_handle_open(int mountdirfd,
                           struct file_handle __user *ufh, int open_flag);
+extern int open_check_o_direct(struct file *f);
 
 /*
  * inode.c
index dd4687ff30d09900a14f113aec870007cfcfb7f0..aa4356d09eeeb03167bcf506a7fe8ad98efaba39 100644 (file)
@@ -107,12 +107,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
 }
 
 static int
-isofs_export_encode_fh(struct dentry *dentry,
+isofs_export_encode_fh(struct inode *inode,
                       __u32 *fh32,
                       int *max_len,
-                      int connectable)
+                      struct inode *parent)
 {
-       struct inode * inode = dentry->d_inode;
        struct iso_inode_info * ei = ISOFS_I(inode);
        int len = *max_len;
        int type = 1;
@@ -124,7 +123,7 @@ isofs_export_encode_fh(struct dentry *dentry,
         * offset of the inode and the upper 16 bits of fh32[1] to
         * hold the offset of the parent.
         */
-       if (connectable && (len < 5)) {
+       if (parent && (len < 5)) {
                *max_len = 5;
                return 255;
        } else if (len < 3) {
@@ -136,16 +135,12 @@ isofs_export_encode_fh(struct dentry *dentry,
        fh32[0] = ei->i_iget5_block;
        fh16[2] = (__u16)ei->i_iget5_offset;  /* fh16 [sic] */
        fh32[2] = inode->i_generation;
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
+       if (parent) {
                struct iso_inode_info *eparent;
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
                eparent = ISOFS_I(parent);
                fh32[3] = eparent->i_iget5_block;
                fh16[3] = (__u16)eparent->i_iget5_offset;  /* fh16 [sic] */
                fh32[4] = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
                len = 5;
                type = 2;
        }
index f32f346f4b0a521a5b6bbaedc7b0a5a7750c4b1e..69a48c2944da682c8a133fe75183c086ef08813b 100644 (file)
@@ -1,6 +1,8 @@
 config JBD2
        tristate
        select CRC32
+       select CRYPTO
+       select CRYPTO_CRC32C
        help
          This is a generic journaling layer for block devices that support
          both 32-bit and 64-bit block numbers.  It is currently used by
index 840f70f507924a0ac4db70a9d729f715783b49be..216f4299f65e7e2f1e26859c8e1247cdf71c55df 100644 (file)
@@ -85,6 +85,24 @@ nope:
        __brelse(bh);
 }
 
+static void jbd2_commit_block_csum_set(journal_t *j,
+                                      struct journal_head *descriptor)
+{
+       struct commit_header *h;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+       h->h_chksum_type = 0;
+       h->h_chksum_size = 0;
+       h->h_chksum[0] = 0;
+       csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+                          j->j_blocksize);
+       h->h_chksum[0] = cpu_to_be32(csum);
+}
+
 /*
  * Done it all: now submit the commit record.  We should have
  * cleaned up our previous buffers by now, so if we are in abort
@@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal,
                tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
        }
+       jbd2_commit_block_csum_set(journal, descriptor);
 
        JBUFFER_TRACE(descriptor, "submit commit block");
        lock_buffer(bh);
@@ -301,6 +320,44 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
 }
 
+static void jbd2_descr_block_csum_set(journal_t *j,
+                                     struct journal_head *descriptor)
+{
+       struct jbd2_journal_block_tail *tail;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       tail = (struct jbd2_journal_block_tail *)
+                       (jh2bh(descriptor)->b_data + j->j_blocksize -
+                       sizeof(struct jbd2_journal_block_tail));
+       tail->t_checksum = 0;
+       csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+                          j->j_blocksize);
+       tail->t_checksum = cpu_to_be32(csum);
+}
+
+static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
+                                   struct buffer_head *bh, __u32 sequence)
+{
+       struct page *page = bh->b_page;
+       __u8 *addr;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       sequence = cpu_to_be32(sequence);
+       addr = kmap_atomic(page, KM_USER0);
+       csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+                         sizeof(sequence));
+       csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
+                         bh->b_size);
+       kunmap_atomic(addr, KM_USER0);
+
+       tag->t_checksum = cpu_to_be32(csum);
+}
 /*
  * jbd2_journal_commit_transaction
  *
@@ -334,6 +391,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        unsigned long first_block;
        tid_t first_tid;
        int update_tail;
+       int csum_size = 0;
+
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               csum_size = sizeof(struct jbd2_journal_block_tail);
 
        /*
         * First job: lock down the current transaction and wait for
@@ -627,7 +688,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
                tag = (journal_block_tag_t *) tagp;
                write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
-               tag->t_flags = cpu_to_be32(tag_flag);
+               tag->t_flags = cpu_to_be16(tag_flag);
+               jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+                                       commit_transaction->t_tid);
                tagp += tag_bytes;
                space_left -= tag_bytes;
 
@@ -643,7 +706,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 
                if (bufs == journal->j_wbufsize ||
                    commit_transaction->t_buffers == NULL ||
-                   space_left < tag_bytes + 16) {
+                   space_left < tag_bytes + 16 + csum_size) {
 
                        jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
 
@@ -651,8 +714,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                            submitting the IOs.  "tag" still points to
                            the last tag we set up. */
 
-                       tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+                       tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
 
+                       jbd2_descr_block_csum_set(journal, descriptor);
 start_journal_io:
                        for (i = 0; i < bufs; i++) {
                                struct buffer_head *bh = wbuf[i];
index 1afb701622b0b17748b4cd7f7d171df139bcc9cc..e9a3c4c85594e30aca1ed1f14d5667ba0595160a 100644 (file)
@@ -97,6 +97,43 @@ EXPORT_SYMBOL(jbd2_inode_cache);
 static void __journal_abort_soft (journal_t *journal, int errno);
 static int jbd2_journal_create_slab(size_t slab_size);
 
+/* Checksumming functions */
+int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
+}
+
+static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+{
+       __u32 csum, old_csum;
+
+       old_csum = sb->s_checksum;
+       sb->s_checksum = 0;
+       csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+       sb->s_checksum = old_csum;
+
+       return cpu_to_be32(csum);
+}
+
+int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       return sb->s_checksum == jbd2_superblock_csum(j, sb);
+}
+
+void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       sb->s_checksum = jbd2_superblock_csum(j, sb);
+}
+
 /*
  * Helper function used to manage commit timeouts
  */
@@ -1348,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
        jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
                  journal->j_errno);
        sb->s_errno    = cpu_to_be32(journal->j_errno);
+       jbd2_superblock_csum_set(journal, sb);
        read_unlock(&journal->j_state_lock);
 
        jbd2_write_superblock(journal, WRITE_SYNC);
@@ -1376,6 +1414,9 @@ static int journal_get_superblock(journal_t *journal)
                }
        }
 
+       if (buffer_verified(bh))
+               return 0;
+
        sb = journal->j_superblock;
 
        err = -EINVAL;
@@ -1413,6 +1454,43 @@ static int journal_get_superblock(journal_t *journal)
                goto out;
        }
 
+       if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
+           JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               /* Can't have checksum v1 and v2 on at the same time! */
+               printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
+                      "at the same time!\n");
+               goto out;
+       }
+
+       if (!jbd2_verify_csum_type(journal, sb)) {
+               printk(KERN_ERR "JBD: Unknown checksum type\n");
+               goto out;
+       }
+
+       /* Load the checksum driver */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+               if (IS_ERR(journal->j_chksum_driver)) {
+                       printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
+                       err = PTR_ERR(journal->j_chksum_driver);
+                       journal->j_chksum_driver = NULL;
+                       goto out;
+               }
+       }
+
+       /* Check superblock checksum */
+       if (!jbd2_superblock_csum_verify(journal, sb)) {
+               printk(KERN_ERR "JBD: journal checksum error\n");
+               goto out;
+       }
+
+       /* Precompute checksum seed for all metadata */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+                                                  sizeof(sb->s_uuid));
+
+       set_buffer_verified(bh);
+
        return 0;
 
 out:
@@ -1564,6 +1642,8 @@ int jbd2_journal_destroy(journal_t *journal)
                iput(journal->j_inode);
        if (journal->j_revoke)
                jbd2_journal_destroy_revoke(journal);
+       if (journal->j_chksum_driver)
+               crypto_free_shash(journal->j_chksum_driver);
        kfree(journal->j_wbuf);
        kfree(journal);
 
@@ -1653,6 +1733,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
 int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
                          unsigned long ro, unsigned long incompat)
 {
+#define INCOMPAT_FEATURE_ON(f) \
+               ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
+#define COMPAT_FEATURE_ON(f) \
+               ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
        journal_superblock_t *sb;
 
        if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
@@ -1661,16 +1745,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
        if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
                return 0;
 
+       /* Asking for checksumming v2 and v1?  Only give them v2. */
+       if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+           compat & JBD2_FEATURE_COMPAT_CHECKSUM)
+               compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
+
        jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
                  compat, ro, incompat);
 
        sb = journal->j_superblock;
 
+       /* If enabling v2 checksums, update superblock */
+       if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
+               sb->s_feature_compat &=
+                       ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
+
+               /* Load the checksum driver */
+               if (journal->j_chksum_driver == NULL) {
+                       journal->j_chksum_driver = crypto_alloc_shash("crc32c",
+                                                                     0, 0);
+                       if (IS_ERR(journal->j_chksum_driver)) {
+                               printk(KERN_ERR "JBD: Cannot load crc32c "
+                                      "driver.\n");
+                               journal->j_chksum_driver = NULL;
+                               return 0;
+                       }
+               }
+
+               /*
+                * Seed for all metadata checksums.  The CSUM_V2 bit is not in
+                * sb yet (it is OR-ed in below), so do not test for it here.
+                */
+               journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+                                                  sizeof(sb->s_uuid));
+       }
+
+       /* If enabling v1 checksums, downgrade superblock */
+       if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
+               sb->s_feature_incompat &=
+                       ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+
        sb->s_feature_compat    |= cpu_to_be32(compat);
        sb->s_feature_ro_compat |= cpu_to_be32(ro);
        sb->s_feature_incompat  |= cpu_to_be32(incompat);
 
        return 1;
+#undef COMPAT_FEATURE_ON
+#undef INCOMPAT_FEATURE_ON
 }
 
 /*
@@ -1975,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
  */
 size_t journal_tag_bytes(journal_t *journal)
 {
+       journal_block_tag_t tag;
+       size_t x = 0;
+
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               x += sizeof(tag.t_checksum);
+
        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
-               return JBD2_TAG_SIZE64;
+               return x + JBD2_TAG_SIZE64;
        else
-               return JBD2_TAG_SIZE32;
+               return x + JBD2_TAG_SIZE32;
 }
 
 /*
index c1a03354a22ff1b5a787251b422afcb5225ca2c9..0131e4362534c4d5b83273130ee292463ec49f07 100644 (file)
@@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
        return 0;
 }
 
+static int jbd2_descr_block_csum_verify(journal_t *j,
+                                       void *buf)
+{
+       struct jbd2_journal_block_tail *tail;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
+                       sizeof(struct jbd2_journal_block_tail));
+       provided = tail->t_checksum;
+       tail->t_checksum = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       tail->t_checksum = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
 
 /*
  * Count the number of in-use tags in a journal descriptor block.
@@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
        int                     nr = 0, size = journal->j_blocksize;
        int                     tag_bytes = journal_tag_bytes(journal);
 
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               size -= sizeof(struct jbd2_journal_block_tail);
+
        tagp = &bh->b_data[sizeof(journal_header_t)];
 
        while ((tagp - bh->b_data + tag_bytes) <= size) {
@@ -193,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
 
                nr++;
                tagp += tag_bytes;
-               if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+               if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
                        tagp += 16;
 
-               if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+               if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
                        break;
        }
 
@@ -353,6 +375,41 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
        return 0;
 }
 
+static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
+{
+       struct commit_header *h;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       h = buf;
+       provided = h->h_chksum[0];
+       h->h_chksum[0] = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       h->h_chksum[0] = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
+
+static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+                                     void *buf, __u32 sequence)
+{
+       __u32 calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+       /* Hash the big-endian sequence number, then the block contents. */
+       sequence = cpu_to_be32(sequence);
+       calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+                                sizeof(sequence));
+       calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
+       /* Match iff equal to what jbd2_block_tag_csum_set() would store. */
+       return tag->t_checksum ==
+               (typeof(tag->t_checksum))cpu_to_be32(calculated);
+}
+
 static int do_one_pass(journal_t *journal,
                        struct recovery_info *info, enum passtype pass)
 {
@@ -366,6 +423,7 @@ static int do_one_pass(journal_t *journal,
        int                     blocktype;
        int                     tag_bytes = journal_tag_bytes(journal);
        __u32                   crc32_sum = ~0; /* Transactional Checksums */
+       int                     descr_csum_size = 0;
 
        /*
         * First thing is to establish what we expect to find in the log
@@ -451,6 +509,18 @@ static int do_one_pass(journal_t *journal,
 
                switch(blocktype) {
                case JBD2_DESCRIPTOR_BLOCK:
+                       /* Verify checksum first */
+                       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                       JBD2_FEATURE_INCOMPAT_CSUM_V2))
+                               descr_csum_size =
+                                       sizeof(struct jbd2_journal_block_tail);
+                       if (descr_csum_size > 0 &&
+                           !jbd2_descr_block_csum_verify(journal,
+                                                         bh->b_data)) {
+                               err = -EIO;
+                               goto failed;
+                       }
+
                        /* If it is a valid descriptor block, replay it
                         * in pass REPLAY; if journal_checksums enabled, then
                         * calculate checksums in PASS_SCAN, otherwise,
@@ -481,11 +551,11 @@ static int do_one_pass(journal_t *journal,
 
                        tagp = &bh->b_data[sizeof(journal_header_t)];
                        while ((tagp - bh->b_data + tag_bytes)
-                              <= journal->j_blocksize) {
+                              <= journal->j_blocksize - descr_csum_size) {
                                unsigned long io_block;
 
                                tag = (journal_block_tag_t *) tagp;
-                               flags = be32_to_cpu(tag->t_flags);
+                               flags = be16_to_cpu(tag->t_flags);
 
                                io_block = next_log_block++;
                                wrap(journal, next_log_block);
@@ -516,6 +586,19 @@ static int do_one_pass(journal_t *journal,
                                                goto skip_write;
                                        }
 
+                                       /* Don't replay a corrupt block */
+                                       if (!jbd2_block_tag_csum_verify(
+                                               journal, tag, obh->b_data,
+                                               be32_to_cpu(tmp->h_sequence))) {
+                                               brelse(obh);
+                                               success = -EIO;
+                                               printk(KERN_ERR "JBD: Invalid "
+                                                      "checksum recovering "
+                                                      "block %llu in log\n",
+                                                      blocknr);
+                                               goto skip_write;
+                                       }
+
                                        /* Find a buffer for the new
                                         * data being restored */
                                        nbh = __getblk(journal->j_fs_dev,
@@ -650,6 +733,19 @@ static int do_one_pass(journal_t *journal,
                                }
                                crc32_sum = ~0;
                        }
+                       if (pass == PASS_SCAN &&
+                           !jbd2_commit_block_csum_verify(journal,
+                                                          bh->b_data)) {
+                               info->end_transaction = next_commit_ID;
+
+                               if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                    JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+                                       journal->j_failed_commit =
+                                               next_commit_ID;
+                                       brelse(bh);
+                                       break;
+                               }
+                       }
                        brelse(bh);
                        next_commit_ID++;
                        continue;
@@ -706,6 +802,25 @@ static int do_one_pass(journal_t *journal,
        return err;
 }
 
+static int jbd2_revoke_block_csum_verify(journal_t *j,
+                                        void *buf)
+{
+       struct jbd2_journal_revoke_tail *tail;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+                       sizeof(struct jbd2_journal_revoke_tail));
+       provided = tail->r_checksum;
+       tail->r_checksum = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       tail->r_checksum = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
 
 /* Scan a revoke record, marking all blocks mentioned as revoked. */
 
@@ -720,6 +835,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
        offset = sizeof(jbd2_journal_revoke_header_t);
        max = be32_to_cpu(header->r_count);
 
+       if (!jbd2_revoke_block_csum_verify(journal, header))
+               return -EINVAL;
+
        if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
                record_len = 8;
 
index 6973705d6a3d9db1c96ed67f55c97c8a13ee2f6d..f30b80b4ce8bef98cab621bf731e13682661ca6d 100644 (file)
@@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal,
                                    struct jbd2_revoke_record_s *record,
                                    int write_op)
 {
+       int csum_size = 0;
        struct journal_head *descriptor;
        int offset;
        journal_header_t *header;
@@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal,
        descriptor = *descriptorp;
        offset = *offsetp;
 
+       /* Do we need to leave space at the end for a checksum? */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               csum_size = sizeof(struct jbd2_journal_revoke_tail);
+
        /* Make sure we have a descriptor with space left for the record */
        if (descriptor) {
-               if (offset == journal->j_blocksize) {
+               if (offset >= journal->j_blocksize - csum_size) {
                        flush_descriptor(journal, descriptor, offset, write_op);
                        descriptor = NULL;
                }
@@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal,
        *offsetp = offset;
 }
 
+/* With CSUM_V2, checksum the block (tail zeroed) and store it in the tail. */
+static void jbd2_revoke_csum_set(journal_t *j,
+                                struct journal_head *descriptor)
+{
+       struct jbd2_journal_revoke_tail *tail;
+       char *data;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       data = jh2bh(descriptor)->b_data;
+       tail = (struct jbd2_journal_revoke_tail *)
+                       (data + j->j_blocksize - sizeof(*tail));
+       tail->r_checksum = 0;
+       tail->r_checksum = cpu_to_be32(jbd2_chksum(j, j->j_csum_seed, data,
+                                                  j->j_blocksize));
+}
+
 /*
  * Flush a revoke descriptor out to the journal.  If we are aborting,
  * this is a noop; otherwise we are generating a buffer which needs to
@@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal,
 
        header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
        header->r_count = cpu_to_be32(offset);
+       jbd2_revoke_csum_set(journal, descriptor);
+
        set_buffer_jwrite(bh);
        BUFFER_TRACE(bh, "write");
        set_buffer_dirty(bh);
index ddcd3549c6c26cbc9cb9dd46831b189ed3c0441e..fb1ab9533b67277a557cd5f8ea9f7216b8284d4e 100644 (file)
@@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 
 alloc_transaction:
        if (!journal->j_running_transaction) {
-               new_transaction = kmem_cache_alloc(transaction_cache,
-                                                  gfp_mask | __GFP_ZERO);
+               new_transaction = kmem_cache_zalloc(transaction_cache,
+                                                   gfp_mask);
                if (!new_transaction) {
                        /*
                         * If __GFP_FS is not present, then we may be
index 55a0c1dceadfddcf990b8fdbcfec015fc75fab32..44dca1f041c5cbc2b057e3eb7e13840c3edd25d7 100644 (file)
@@ -126,6 +126,10 @@ struct jffs2_sb_info {
        struct jffs2_inodirty *wbuf_inodes;
        struct rw_semaphore wbuf_sem;   /* Protects the write buffer */
 
+       struct delayed_work wbuf_dwork; /* write-buffer write-out work */
+       int wbuf_queued;                /* non-zero if delayed work is queued */
+       spinlock_t wbuf_dwork_lock;     /* protects wbuf_dwork and wbuf_queued */
+
        unsigned char *oobbuf;
        int oobavail; /* How many bytes are available for JFFS2 in OOB */
 #endif
index 1cd3aec9d9ae282dd31226d0717aaf69a55f414d..bcd983d7e7f99e7e295decc1d26092d464a14d9f 100644 (file)
@@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
 #define jffs2_ubivol(c) (0)
 #define jffs2_ubivol_setup(c) (0)
 #define jffs2_ubivol_cleanup(c) do {} while (0)
+#define jffs2_dirty_trigger(c) do {} while (0)
 
 #else /* NAND and/or ECC'd NOR support present */
 
@@ -135,14 +136,10 @@ void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
 #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
 int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
 void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
+void jffs2_dirty_trigger(struct jffs2_sb_info *c);
 
 #endif /* WRITEBUFFER */
 
-static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
-{
-       OFNI_BS_2SFFJ(c)->s_dirt = 1;
-}
-
 /* background.c */
 int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c);
 void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
index f9916f312bd81e3590fde1c92a025458cb64ab11..bc586f204228633ed2d3bcf388991a0ec4b21ff8 100644 (file)
@@ -63,21 +63,6 @@ static void jffs2_i_init_once(void *foo)
        inode_init_once(&f->vfs_inode);
 }
 
-static void jffs2_write_super(struct super_block *sb)
-{
-       struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-
-       lock_super(sb);
-       sb->s_dirt = 0;
-
-       if (!(sb->s_flags & MS_RDONLY)) {
-               jffs2_dbg(1, "%s()\n", __func__);
-               jffs2_flush_wbuf_gc(c, 0);
-       }
-
-       unlock_super(sb);
-}
-
 static const char *jffs2_compr_name(unsigned int compr)
 {
        switch (compr) {
@@ -113,8 +98,6 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
        struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
-       jffs2_write_super(sb);
-
        mutex_lock(&c->alloc_sem);
        jffs2_flush_wbuf_pad(c);
        mutex_unlock(&c->alloc_sem);
@@ -251,7 +234,6 @@ static const struct super_operations jffs2_super_operations =
        .alloc_inode =  jffs2_alloc_inode,
        .destroy_inode =jffs2_destroy_inode,
        .put_super =    jffs2_put_super,
-       .write_super =  jffs2_write_super,
        .statfs =       jffs2_statfs,
        .remount_fs =   jffs2_remount_fs,
        .evict_inode =  jffs2_evict_inode,
@@ -319,9 +301,6 @@ static void jffs2_put_super (struct super_block *sb)
 
        jffs2_dbg(2, "%s()\n", __func__);
 
-       if (sb->s_dirt)
-               jffs2_write_super(sb);
-
        mutex_lock(&c->alloc_sem);
        jffs2_flush_wbuf_pad(c);
        mutex_unlock(&c->alloc_sem);
index 74d9be19df3f1fff1d7defdc7824c90240a302f6..6f4529d3697fd3f97d5b018dbe9f5c0362cee034 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/mtd/nand.h>
 #include <linux/jiffies.h>
 #include <linux/sched.h>
+#include <linux/writeback.h>
 
 #include "nodelist.h"
 
@@ -85,7 +86,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
 {
        struct jffs2_inodirty *new;
 
-       /* Mark the superblock dirty so that kupdated will flush... */
+       /* Schedule delayed write-buffer write-out */
        jffs2_dirty_trigger(c);
 
        if (jffs2_wbuf_pending_for_ino(c, ino))
@@ -1148,6 +1149,47 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
        return 1;
 }
 
+/* Map the work item embedded in wbuf_dwork back to its jffs2_sb_info. */
+static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
+{
+       struct delayed_work *dw = container_of(work, struct delayed_work, work);
+
+       return container_of(dw, struct jffs2_sb_info, wbuf_dwork);
+}
+
+/* Work handler: clear the queued flag, then flush unless read-only. */
+static void delayed_wbuf_sync(struct work_struct *work)
+{
+       struct jffs2_sb_info *c = work_to_sb(work);
+
+       spin_lock(&c->wbuf_dwork_lock);
+       c->wbuf_queued = 0;
+       spin_unlock(&c->wbuf_dwork_lock);
+
+       if (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
+               return;
+       jffs2_dbg(1, "%s()\n", __func__);
+       jffs2_flush_wbuf_gc(c, 0);
+}
+
+/* Queue the delayed write-buffer write-out unless one is already queued. */
+void jffs2_dirty_trigger(struct jffs2_sb_info *c)
+{
+       if (OFNI_BS_2SFFJ(c)->s_flags & MS_RDONLY)
+               return;
+
+       spin_lock(&c->wbuf_dwork_lock);
+       if (c->wbuf_queued)
+               goto unlock;
+
+       jffs2_dbg(1, "%s()\n", __func__);
+       queue_delayed_work(system_long_wq, &c->wbuf_dwork,
+                          msecs_to_jiffies(dirty_writeback_interval * 10));
+       c->wbuf_queued = 1;
+unlock:
+       spin_unlock(&c->wbuf_dwork_lock);
+}
+
 int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 {
        struct nand_ecclayout *oinfo = c->mtd->ecclayout;
@@ -1169,6 +1211,8 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
 
        /* Initialise write buffer */
        init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
        c->wbuf_pagesize = c->mtd->writesize;
        c->wbuf_ofs = 0xFFFFFFFF;
 
@@ -1207,8 +1251,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
 
        /* Initialize write buffer */
        init_rwsem(&c->wbuf_sem);
-
-
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
        c->wbuf_pagesize =  c->mtd->erasesize;
 
        /* Find a suitable c->sector_size
@@ -1267,6 +1311,9 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
 
        /* Initialize write buffer */
        init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+
        c->wbuf_pagesize = c->mtd->writesize;
        c->wbuf_ofs = 0xFFFFFFFF;
 
@@ -1299,6 +1346,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
                return 0;
 
        init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
 
        c->wbuf_pagesize =  c->mtd->writesize;
        c->wbuf_ofs = 0xFFFFFFFF;
index 1ead0750cdbb00d680320ffb5b5747a504500b44..80938fda67e0e6fde67999d3556820b87b6acd33 100644 (file)
@@ -251,39 +251,40 @@ out_err:
        return err;
 }
 
-static int lockd_up_net(struct net *net)
+static int lockd_up_net(struct svc_serv *serv, struct net *net)
 {
        struct lockd_net *ln = net_generic(net, lockd_net_id);
-       struct svc_serv *serv = nlmsvc_rqst->rq_server;
        int error;
 
-       if (ln->nlmsvc_users)
+       if (ln->nlmsvc_users++)
                return 0;
 
-       error = svc_rpcb_setup(serv, net);
+       error = svc_bind(serv, net);
        if (error)
-               goto err_rpcb;
+               goto err_bind;
 
        error = make_socks(serv, net);
        if (error < 0)
                goto err_socks;
+       dprintk("lockd_up_net: per-net data created; net=%p\n", net);
        return 0;
 
 err_socks:
        svc_rpcb_cleanup(serv, net);
-err_rpcb:
+err_bind:
+       ln->nlmsvc_users--;
        return error;
 }
 
-static void lockd_down_net(struct net *net)
+static void lockd_down_net(struct svc_serv *serv, struct net *net)
 {
        struct lockd_net *ln = net_generic(net, lockd_net_id);
-       struct svc_serv *serv = nlmsvc_rqst->rq_server;
 
        if (ln->nlmsvc_users) {
                if (--ln->nlmsvc_users == 0) {
                        nlm_shutdown_hosts_net(net);
                        svc_shutdown_net(serv, net);
+                       dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
                }
        } else {
                printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
@@ -292,21 +293,60 @@ static void lockd_down_net(struct net *net)
        }
 }
 
-/*
- * Bring up the lockd process if it's not already up.
- */
-int lockd_up(struct net *net)
+static int lockd_start_svc(struct svc_serv *serv)
+{
+       int error;
+
+       if (nlmsvc_rqst)        /* rqst already set up: nothing to start */
+               return 0;
+
+       /*
+        * Prepare the svc_rqst; the thread itself is started by kthread_run().
+        */
+       nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+       if (IS_ERR(nlmsvc_rqst)) {
+               error = PTR_ERR(nlmsvc_rqst);
+               printk(KERN_WARNING
+                       "lockd_up: svc_rqst allocation failed, error=%d\n",
+                       error);
+               goto out_rqst;
+       }
+
+       svc_sock_update_bufs(serv);
+       serv->sv_maxconn = nlm_max_connections;
+
+       nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+       if (IS_ERR(nlmsvc_task)) {
+               error = PTR_ERR(nlmsvc_task);
+               printk(KERN_WARNING
+                       "lockd_up: kthread_run failed, error=%d\n", error);
+               goto out_task;
+       }
+       dprintk("lockd_up: service started\n");
+       return 0;
+
+out_task:
+       svc_exit_thread(nlmsvc_rqst);   /* kthread_run() failed; rqst not needed */
+       nlmsvc_task = NULL;
+out_rqst:
+       nlmsvc_rqst = NULL;             /* so the next call starts from scratch */
+       return error;
+}
+
+static struct svc_serv *lockd_create_svc(void)
 {
        struct svc_serv *serv;
-       int             error = 0;
 
-       mutex_lock(&nlmsvc_mutex);
        /*
         * Check whether we're already up and running.
         */
        if (nlmsvc_rqst) {
-               error = lockd_up_net(net);
-               goto out;
+               /*
+                * Note: increase service usage, because later in case of error
+                * svc_destroy() will be called.
+                */
+               svc_get(nlmsvc_rqst->rq_server);
+               return nlmsvc_rqst->rq_server;
        }
 
        /*
@@ -317,59 +357,53 @@ int lockd_up(struct net *net)
                printk(KERN_WARNING
                        "lockd_up: no pid, %d users??\n", nlmsvc_users);
 
-       error = -ENOMEM;
        serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
        if (!serv) {
                printk(KERN_WARNING "lockd_up: create service failed\n");
-               goto out;
+               return ERR_PTR(-ENOMEM);
        }
+       dprintk("lockd_up: service created\n");
+       return serv;
+}
 
-       error = make_socks(serv, net);
-       if (error < 0)
-               goto destroy_and_out;
+/*
+ * Bring up the lockd process if it's not already up.
+ */
+int lockd_up(struct net *net)
+{
+       struct svc_serv *serv;
+       int error;
 
-       /*
-        * Create the kernel thread and wait for it to start.
-        */
-       nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-       if (IS_ERR(nlmsvc_rqst)) {
-               error = PTR_ERR(nlmsvc_rqst);
-               nlmsvc_rqst = NULL;
-               printk(KERN_WARNING
-                       "lockd_up: svc_rqst allocation failed, error=%d\n",
-                       error);
-               goto destroy_and_out;
+       mutex_lock(&nlmsvc_mutex);
+
+       serv = lockd_create_svc();
+       if (IS_ERR(serv)) {
+               error = PTR_ERR(serv);
+               goto err_create;
        }
 
-       svc_sock_update_bufs(serv);
-       serv->sv_maxconn = nlm_max_connections;
+       error = lockd_up_net(serv, net);
+       if (error < 0)
+               goto err_net;
 
-       nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
-       if (IS_ERR(nlmsvc_task)) {
-               error = PTR_ERR(nlmsvc_task);
-               svc_exit_thread(nlmsvc_rqst);
-               nlmsvc_task = NULL;
-               nlmsvc_rqst = NULL;
-               printk(KERN_WARNING
-                       "lockd_up: kthread_run failed, error=%d\n", error);
-               goto destroy_and_out;
-       }
+       error = lockd_start_svc(serv);
+       if (error < 0)
+               goto err_start;
 
+       nlmsvc_users++;
        /*
         * Note: svc_serv structures have an initial use count of 1,
         * so we exit through here on both success and failure.
         */
-destroy_and_out:
+err_net:
        svc_destroy(serv);
-out:
-       if (!error) {
-               struct lockd_net *ln = net_generic(net, lockd_net_id);
-
-               ln->nlmsvc_users++;
-               nlmsvc_users++;
-       }
+err_create:
        mutex_unlock(&nlmsvc_mutex);
        return error;
+
+err_start:
+       lockd_down_net(serv, net);
+       goto err_net;
 }
 EXPORT_SYMBOL_GPL(lockd_up);
 
@@ -380,11 +414,10 @@ void
 lockd_down(struct net *net)
 {
        mutex_lock(&nlmsvc_mutex);
+       lockd_down_net(nlmsvc_rqst->rq_server, net);
        if (nlmsvc_users) {
-               if (--nlmsvc_users) {
-                       lockd_down_net(net);
+               if (--nlmsvc_users)
                        goto out;
-               }
        } else {
                printk(KERN_ERR "lockd_down: no users! task=%p\n",
                        nlmsvc_task);
@@ -396,7 +429,9 @@ lockd_down(struct net *net)
                BUG();
        }
        kthread_stop(nlmsvc_task);
+       dprintk("lockd_down: service stopped\n");
        svc_exit_thread(nlmsvc_rqst);
+       dprintk("lockd_down: service destroyed\n");
        nlmsvc_task = NULL;
        nlmsvc_rqst = NULL;
 out:
index 4f441e46cef47bc67b08a3e82b78f389dfbbf818..814c51d0de4739e4b89e9091e00c17f284c0e2ba 100644 (file)
@@ -1636,12 +1636,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
 SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
 {
        struct file *filp;
+       int fput_needed;
        struct file_lock *lock;
        int can_sleep, unlock;
        int error;
 
        error = -EBADF;
-       filp = fget(fd);
+       filp = fget_light(fd, &fput_needed);
        if (!filp)
                goto out;
 
@@ -1674,7 +1675,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
        locks_free_lock(lock);
 
  out_putf:
-       fput(filp);
+       fput_light(filp, fput_needed);
  out:
        return error;
 }
index c651f02c9fecb930c97a2668adc075090c04b7c9..7d694194024ac4d2459e7cc3d60014bdff64e3ba 100644 (file)
@@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
        mntget(nd->path.mnt);
 
        rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        nd->flags &= ~LOOKUP_RCU;
        return 0;
 
@@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd)
                if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
                        spin_unlock(&dentry->d_lock);
                        rcu_read_unlock();
-                       br_read_unlock(vfsmount_lock);
+                       br_read_unlock(&vfsmount_lock);
                        return -ECHILD;
                }
                BUG_ON(nd->inode != dentry->d_inode);
                spin_unlock(&dentry->d_lock);
                mntget(nd->path.mnt);
                rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
        }
 
        if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -681,15 +681,15 @@ int follow_up(struct path *path)
        struct mount *parent;
        struct dentry *mountpoint;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        parent = mnt->mnt_parent;
        if (&parent->mnt == path->mnt) {
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return 0;
        }
        mntget(&parent->mnt);
        mountpoint = dget(mnt->mnt_mountpoint);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        dput(path->dentry);
        path->dentry = mountpoint;
        mntput(path->mnt);
@@ -947,7 +947,7 @@ failed:
        if (!(nd->flags & LOOKUP_ROOT))
                nd->root.mnt = NULL;
        rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return -ECHILD;
 }
 
@@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
  *  small and for now I'd prefer to have fast path as straight as possible.
  *  It _is_ time-critical.
  */
-static int do_lookup(struct nameidata *nd, struct qstr *name,
-                       struct path *path, struct inode **inode)
+static int lookup_fast(struct nameidata *nd, struct qstr *name,
+                      struct path *path, struct inode **inode)
 {
        struct vfsmount *mnt = nd->path.mnt;
        struct dentry *dentry, *parent = nd->path.dentry;
@@ -1208,7 +1208,7 @@ unlazy:
                        goto need_lookup;
                }
        }
-done:
+
        path->mnt = mnt;
        path->dentry = dentry;
        err = follow_managed(path, nd->flags);
@@ -1222,6 +1222,17 @@ done:
        return 0;
 
 need_lookup:
+       return 1;
+}
+
+/* Fast lookup failed, do it the slow way */
+static int lookup_slow(struct nameidata *nd, struct qstr *name,
+                      struct path *path)
+{
+       struct dentry *dentry, *parent;
+       int err;
+
+       parent = nd->path.dentry;
        BUG_ON(nd->inode != parent->d_inode);
 
        mutex_lock(&parent->d_inode->i_mutex);
@@ -1229,7 +1240,16 @@ need_lookup:
        mutex_unlock(&parent->d_inode->i_mutex);
        if (IS_ERR(dentry))
                return PTR_ERR(dentry);
-       goto done;
+       path->mnt = nd->path.mnt;
+       path->dentry = dentry;
+       err = follow_managed(path, nd->flags);
+       if (unlikely(err < 0)) {
+               path_put_conditional(path, nd);
+               return err;
+       }
+       if (err)
+               nd->flags |= LOOKUP_JUMPED;
+       return 0;
 }
 
 static inline int may_lookup(struct nameidata *nd)
@@ -1265,7 +1285,7 @@ static void terminate_walk(struct nameidata *nd)
                if (!(nd->flags & LOOKUP_ROOT))
                        nd->root.mnt = NULL;
                rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
        }
 }
 
@@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
         */
        if (unlikely(type != LAST_NORM))
                return handle_dots(nd, type);
-       err = do_lookup(nd, name, path, &inode);
+       err = lookup_fast(nd, name, path, &inode);
        if (unlikely(err)) {
-               terminate_walk(nd);
-               return err;
-       }
-       if (!inode) {
-               path_to_nameidata(path, nd);
-               terminate_walk(nd);
-               return -ENOENT;
+               if (err < 0)
+                       goto out_err;
+
+               err = lookup_slow(nd, name, path);
+               if (err < 0)
+                       goto out_err;
+
+               inode = path->dentry->d_inode;
        }
+       err = -ENOENT;
+       if (!inode)
+               goto out_path_put;
+
        if (should_follow_link(inode, follow)) {
                if (nd->flags & LOOKUP_RCU) {
                        if (unlikely(unlazy_walk(nd, path->dentry))) {
-                               terminate_walk(nd);
-                               return -ECHILD;
+                               err = -ECHILD;
+                               goto out_err;
                        }
                }
                BUG_ON(inode != path->dentry->d_inode);
@@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
        path_to_nameidata(path, nd);
        nd->inode = inode;
        return 0;
+
+out_path_put:
+       path_to_nameidata(path, nd);
+out_err:
+       terminate_walk(nd);
+       return err;
 }
 
 /*
@@ -1620,7 +1651,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                nd->path = nd->root;
                nd->inode = inode;
                if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                } else {
@@ -1633,7 +1664,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 
        if (*name=='/') {
                if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                        set_root_rcu(nd);
                } else {
@@ -1646,7 +1677,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                        struct fs_struct *fs = current->fs;
                        unsigned seq;
 
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
 
                        do {
@@ -1682,7 +1713,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                        if (fput_needed)
                                *fp = file;
                        nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                        rcu_read_lock();
                } else {
                        path_get(&file->f_path);
@@ -2169,6 +2200,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        int want_write = 0;
        int acc_mode = op->acc_mode;
        struct file *filp;
+       struct inode *inode;
+       int symlink_ok = 0;
+       struct path save_parent = { .dentry = NULL, .mnt = NULL };
+       bool retried = false;
        int error;
 
        nd->flags &= ~LOOKUP_PARENT;
@@ -2200,30 +2235,23 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        }
 
        if (!(open_flag & O_CREAT)) {
-               int symlink_ok = 0;
                if (nd->last.name[nd->last.len])
                        nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
                        symlink_ok = 1;
                /* we _can_ be in RCU mode here */
-               error = walk_component(nd, path, &nd->last, LAST_NORM,
-                                       !symlink_ok);
-               if (error < 0)
-                       return ERR_PTR(error);
-               if (error) /* symlink */
-                       return NULL;
-               /* sayonara */
-               error = complete_walk(nd);
-               if (error)
-                       return ERR_PTR(error);
+               error = lookup_fast(nd, &nd->last, path, &inode);
+               if (unlikely(error)) {
+                       if (error < 0)
+                               goto exit;
 
-               error = -ENOTDIR;
-               if (nd->flags & LOOKUP_DIRECTORY) {
-                       if (!nd->inode->i_op->lookup)
+                       error = lookup_slow(nd, &nd->last, path);
+                       if (error < 0)
                                goto exit;
+
+                       inode = path->dentry->d_inode;
                }
-               audit_inode(pathname, nd->path.dentry);
-               goto ok;
+               goto finish_lookup;
        }
 
        /* create side of things */
@@ -2241,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (nd->last.name[nd->last.len])
                goto exit;
 
+retry_lookup:
        mutex_lock(&dir->d_inode->i_mutex);
 
        dentry = lookup_hash(nd);
@@ -2302,22 +2331,49 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        if (error)
                nd->flags |= LOOKUP_JUMPED;
 
+       BUG_ON(nd->flags & LOOKUP_RCU);
+       inode = path->dentry->d_inode;
+finish_lookup:
+       /* we _can_ be in RCU mode here */
        error = -ENOENT;
-       if (!path->dentry->d_inode)
-               goto exit_dput;
+       if (!inode) {
+               path_to_nameidata(path, nd);
+               goto exit;
+       }
 
-       if (path->dentry->d_inode->i_op->follow_link)
+       if (should_follow_link(inode, !symlink_ok)) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, path->dentry))) {
+                               error = -ECHILD;
+                               goto exit;
+                       }
+               }
+               BUG_ON(inode != path->dentry->d_inode);
                return NULL;
+       }
 
-       path_to_nameidata(path, nd);
-       nd->inode = path->dentry->d_inode;
+       if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
+               path_to_nameidata(path, nd);
+       } else {
+               save_parent.dentry = nd->path.dentry;
+               save_parent.mnt = mntget(path->mnt);
+               nd->path.dentry = path->dentry;
+
+       }
+       nd->inode = inode;
        /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
        error = complete_walk(nd);
-       if (error)
+       if (error) {
+               path_put(&save_parent);
                return ERR_PTR(error);
+       }
        error = -EISDIR;
-       if (S_ISDIR(nd->inode->i_mode))
+       if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+               goto exit;
+       error = -ENOTDIR;
+       if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
                goto exit;
+       audit_inode(pathname, nd->path.dentry);
 ok:
        if (!S_ISREG(nd->inode->i_mode))
                will_truncate = 0;
@@ -2333,6 +2389,20 @@ common:
        if (error)
                goto exit;
        filp = nameidata_to_filp(nd);
+       if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) {
+               BUG_ON(save_parent.dentry != dir);
+               path_put(&nd->path);
+               nd->path = save_parent;
+               nd->inode = dir->d_inode;
+               save_parent.mnt = NULL;
+               save_parent.dentry = NULL;
+               if (want_write) {
+                       mnt_drop_write(nd->path.mnt);
+                       want_write = 0;
+               }
+               retried = true;
+               goto retry_lookup;
+       }
        if (!IS_ERR(filp)) {
                error = ima_file_check(filp, op->acc_mode);
                if (error) {
@@ -2352,7 +2422,8 @@ common:
 out:
        if (want_write)
                mnt_drop_write(nd->path.mnt);
-       path_put(&nd->path);
+       path_put(&save_parent);
+       terminate_walk(nd);
        return filp;
 
 exit_mutex_unlock:
@@ -2415,6 +2486,12 @@ out:
        if (base)
                fput(base);
        release_open_intent(nd);
+       if (filp == ERR_PTR(-EOPENSTALE)) {
+               if (flags & LOOKUP_RCU)
+                       filp = ERR_PTR(-ECHILD);
+               else
+                       filp = ERR_PTR(-ESTALE);
+       }
        return filp;
 
 out_filp:
index e6081996c9a2f9d26525740545445630c4737583..1e4a5fe3d7b7f789d66839f37b1f917c1fa3e2ba 100644 (file)
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct mount *mnt)
 {
        int ret = 0;
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
        /*
         * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct mount *mnt)
         */
        smp_wmb();
        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        return ret;
 }
 
 static void __mnt_unmake_readonly(struct mount *mnt)
 {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt->mnt.mnt_flags &= ~MNT_READONLY;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 
 int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
        if (atomic_long_read(&sb->s_remove_count))
                return -EBUSY;
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                        mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
                if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                        mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        return err;
 }
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path *path)
 {
        struct mount *child_mnt;
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
        if (child_mnt) {
                mnt_add_count(child_mnt, 1);
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return &child_mnt->mnt;
        } else {
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return NULL;
        }
 }
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
        mnt->mnt.mnt_sb = root->d_sb;
        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
        mnt->mnt_parent = mnt;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        return &mnt->mnt;
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                mnt->mnt.mnt_root = dget(root);
                mnt->mnt_mountpoint = mnt->mnt.mnt_root;
                mnt->mnt_parent = mnt;
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
 
                if (flag & CL_SLAVE) {
                        list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct mount *mnt)
 {
 put_again:
 #ifdef CONFIG_SMP
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if (likely(atomic_read(&mnt->mnt_longterm))) {
                mnt_add_count(mnt, -1);
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                return;
        }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        mnt_add_count(mnt, -1);
        if (mnt_get_count(mnt)) {
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                return;
        }
 #else
        mnt_add_count(mnt, -1);
        if (likely(mnt_get_count(mnt)))
                return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 #endif
        if (unlikely(mnt->mnt_pinned)) {
                mnt_add_count(mnt, mnt->mnt_pinned + 1);
                mnt->mnt_pinned = 0;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                acct_auto_close_mnt(&mnt->mnt);
                goto put_again;
        }
+
        list_del(&mnt->mnt_instance);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        mntfree(mnt);
 }
 
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
 
 void mnt_pin(struct vfsmount *mnt)
 {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        real_mount(mnt)->mnt_pinned++;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_pin);
 
 void mnt_unpin(struct vfsmount *m)
 {
        struct mount *mnt = real_mount(m);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        if (mnt->mnt_pinned) {
                mnt_add_count(mnt, 1);
                mnt->mnt_pinned--;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 }
 EXPORT_SYMBOL(mnt_unpin);
 
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
        BUG_ON(!m);
 
        /* write lock needed for mnt_get_count */
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        for (p = mnt; p; p = next_mnt(p, mnt)) {
                actual_refs += mnt_get_count(p);
                minimum_refs += 2;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        if (actual_refs > minimum_refs)
                return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
 {
        int ret = 1;
        down_read(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        if (propagate_mount_busy(real_mount(mnt), 2))
                ret = 0;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_read(&namespace_sem);
        return ret;
 }
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *head)
                        struct dentry *dentry;
                        struct mount *m;
 
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                        dentry = mnt->mnt_mountpoint;
                        m = mnt->mnt_parent;
                        mnt->mnt_mountpoint = mnt->mnt.mnt_root;
                        mnt->mnt_parent = mnt;
                        m->mnt_ghosts--;
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                        dput(dentry);
                        mntput(&m->mnt);
                }
@@ -1073,8 +1074,9 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
                list_del_init(&p->mnt_expire);
                list_del_init(&p->mnt_list);
                __touch_mnt_namespace(p->mnt_ns);
+               if (p->mnt_ns)
+                       __mnt_make_shortterm(p);
                p->mnt_ns = NULL;
-               __mnt_make_shortterm(p);
                list_del_init(&p->mnt_child);
                if (mnt_has_parent(p)) {
                        p->mnt_parent->mnt_ghosts++;
@@ -1112,12 +1114,12 @@ static int do_umount(struct mount *mnt, int flags)
                 * probably don't strictly need the lock here if we examined
                 * all race cases, but it's a slowpath.
                 */
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                if (mnt_get_count(mnt) != 2) {
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                        return -EBUSY;
                }
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
 
                if (!xchg(&mnt->mnt_expiry_mark, 1))
                        return -EAGAIN;
@@ -1159,7 +1161,7 @@ static int do_umount(struct mount *mnt, int flags)
        }
 
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        event++;
 
        if (!(flags & MNT_DETACH))
@@ -1171,7 +1173,7 @@ static int do_umount(struct mount *mnt, int flags)
                        umount_tree(mnt, 1, &umount_list);
                retval = 0;
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        return retval;
@@ -1286,19 +1288,19 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                        q = clone_mnt(p, p->mnt.mnt_root, flag);
                        if (!q)
                                goto Enomem;
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                        list_add_tail(&q->mnt_list, &res->mnt_list);
                        attach_mnt(q, &path);
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                }
        }
        return res;
 Enomem:
        if (res) {
                LIST_HEAD(umount_list);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                umount_tree(res, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                release_mounts(&umount_list);
        }
        return NULL;
@@ -1318,9 +1320,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
 {
        LIST_HEAD(umount_list);
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        umount_tree(real_mount(mnt), 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
 }
@@ -1448,7 +1450,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
        if (err)
                goto out_cleanup_ids;
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        if (IS_MNT_SHARED(dest_mnt)) {
                for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1469,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                list_del_init(&child->mnt_hash);
                commit_tree(child);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        return 0;
 
@@ -1565,10 +1567,10 @@ static int do_change_type(struct path *path, int flag)
                        goto out_unlock;
        }
 
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                change_mnt_propagation(m, type);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
  out_unlock:
        up_write(&namespace_sem);
@@ -1617,9 +1619,9 @@ static int do_loopback(struct path *path, char *old_name,
 
        err = graft_tree(mnt, path);
        if (err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                umount_tree(mnt, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
 out2:
        unlock_mount(path);
@@ -1677,16 +1679,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
        else
                err = do_remount_sb(sb, flags, data, 0);
        if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
                mnt->mnt.mnt_flags = mnt_flags;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
        up_write(&sb->s_umount);
        if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                touch_mnt_namespace(mnt->mnt_ns);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
        }
        return err;
 }
@@ -1893,9 +1895,9 @@ fail:
        /* remove m from any expiration list it may be on */
        if (!list_empty(&mnt->mnt_expire)) {
                down_write(&namespace_sem);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                list_del_init(&mnt->mnt_expire);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                up_write(&namespace_sem);
        }
        mntput(m);
@@ -1911,11 +1913,11 @@ fail:
 void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
 {
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
 
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
 }
 EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1937,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                return;
 
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
 
        /* extract from the expiration list every vfsmount that matches the
         * following criteria:
@@ -1954,7 +1956,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                touch_mnt_namespace(mnt->mnt_ns);
                umount_tree(mnt, 1, &umounts);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
 
        release_mounts(&umounts);
@@ -2218,9 +2220,9 @@ void mnt_make_shortterm(struct vfsmount *m)
        struct mount *mnt = real_mount(m);
        if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
                return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        atomic_dec(&mnt->mnt_longterm);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 #endif
 }
 
@@ -2250,9 +2252,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                return ERR_PTR(-ENOMEM);
        }
        new_ns->root = new;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        list_add_tail(&new_ns->list, &new->mnt_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
 
        /*
         * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2418,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 int path_is_under(struct path *path1, struct path *path2)
 {
        int res;
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
        return res;
 }
 EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2507,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* make sure we can reach put_old from new_root */
        if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
                goto out4;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        detach_mnt(new_mnt, &parent_path);
        detach_mnt(root_mnt, &root_parent);
        /* mount old root on put_old */
@@ -2513,7 +2515,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
        /* mount new_root on / */
        attach_mnt(new_mnt, &root_parent);
        touch_mnt_namespace(current->nsproxy->mnt_ns);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        chroot_fs_refs(&root, &new);
        error = 0;
 out4:
@@ -2576,7 +2578,7 @@ void __init mnt_init(void)
        for (u = 0; u < HASH_SIZE; u++)
                INIT_LIST_HEAD(&mount_hashtable[u]);
 
-       br_lock_init(vfsmount_lock);
+       br_lock_init(&vfsmount_lock);
 
        err = sysfs_init();
        if (err)
@@ -2596,9 +2598,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
        if (!atomic_dec_and_test(&ns->count))
                return;
        down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        umount_tree(ns->root, 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        up_write(&namespace_sem);
        release_mounts(&umount_list);
        kfree(ns);
index 3ff5fcc1528fd21ae18a7a240ec9f2920ec30d32..122e260247f53c663550073fda567a4342b0ba63 100644 (file)
@@ -221,6 +221,10 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
 
        already_written = 0;
 
+       errno = file_update_time(file);
+       if (errno)
+               goto outrel;
+
        bouncebuffer = vmalloc(bufsize);
        if (!bouncebuffer) {
                errno = -EIO;   /* -ENOMEM */
@@ -252,8 +256,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
        }
        vfree(bouncebuffer);
 
-       file_update_time(file);
-
        *ppos = pos;
 
        if (pos > i_size_read(inode)) {
index 4af803f13516c98deaf7372af2dda0499e329fe6..54cc0cdb3dcbda111e24a3a67e7953e5173dd07e 100644 (file)
@@ -23,17 +23,17 @@ struct ncp_mount_data_kernel {
        unsigned long    flags;         /* NCP_MOUNT_* flags */
        unsigned int     int_flags;     /* internal flags */
 #define NCP_IMOUNT_LOGGEDIN_POSSIBLE   0x0001
-       __kernel_uid32_t mounted_uid;   /* Who may umount() this filesystem? */
+       uid_t            mounted_uid;   /* Who may umount() this filesystem? */
        struct pid      *wdog_pid;      /* Who cares for our watchdog packets? */
        unsigned int     ncp_fd;        /* The socket to the ncp port */
        unsigned int     time_out;      /* How long should I wait after
                                           sending a NCP request? */
        unsigned int     retry_count;   /* And how often should I retry? */
        unsigned char    mounted_vol[NCP_VOLNAME_LEN + 1];
-       __kernel_uid32_t uid;
-       __kernel_gid32_t gid;
-       __kernel_mode_t  file_mode;
-       __kernel_mode_t  dir_mode;
+       uid_t            uid;
+       gid_t            gid;
+       umode_t          file_mode;
+       umode_t          dir_mode;
        int              info_fd;
 };
 
index eb95f5091c1aff93930e17a829a808023edc2e12..970659daa323865a113d25075d461c50c7f7dc7c 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/kthread.h>
 #include <linux/sunrpc/svcauth_gss.h>
 #include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
 
 #include <net/inet_sock.h>
 
@@ -253,6 +254,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
        char svc_name[12];
        int ret = 0;
        int minorversion_setup;
+       struct net *net = current->nsproxy->net_ns;
 
        mutex_lock(&nfs_callback_mutex);
        if (cb_info->users++ || cb_info->task != NULL) {
@@ -265,6 +267,12 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
                goto out_err;
        }
 
+       ret = svc_bind(serv, net);
+       if (ret < 0) {
+               printk(KERN_WARNING "NFS: bind callback service failed\n");
+               goto out_err;
+       }
+
        minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion,
                                        serv, xprt, &rqstp, &callback_svc);
        if (!minorversion_setup) {
@@ -306,6 +314,8 @@ out_err:
        dprintk("NFS: Couldn't create callback socket or server thread; "
                "err = %d\n", ret);
        cb_info->users--;
+       if (serv)
+               svc_shutdown_net(serv, net);
        goto out;
 }
 
@@ -320,6 +330,7 @@ void nfs_callback_down(int minorversion)
        cb_info->users--;
        if (cb_info->users == 0 && cb_info->task != NULL) {
                kthread_stop(cb_info->task);
+               svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns);
                svc_exit_thread(cb_info->rqst);
                cb_info->serv = NULL;
                cb_info->rqst = NULL;
@@ -332,7 +343,7 @@ void nfs_callback_down(int minorversion)
 int
 check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
 {
-       char *p = svc_gss_principal(rqstp);
+       char *p = rqstp->rq_cred.cr_principal;
 
        if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
                return 1;
index 0989a2099688a377279d76f4f8c56dbc91070027..f430057ff3b397c2fe1f523bf5fcea4135276f8c 100644 (file)
@@ -1354,10 +1354,10 @@ out:
 }
 
 #ifdef CONFIG_NFS_V4
-static int nfs_open_revalidate(struct dentry *, struct nameidata *);
+static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *);
 
 const struct dentry_operations nfs4_dentry_operations = {
-       .d_revalidate   = nfs_open_revalidate,
+       .d_revalidate   = nfs4_lookup_revalidate,
        .d_delete       = nfs_dentry_delete,
        .d_iput         = nfs_dentry_iput,
        .d_automount    = nfs_d_automount,
@@ -1519,13 +1519,11 @@ no_open:
        return nfs_lookup(dir, dentry, nd);
 }
 
-static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct dentry *parent = NULL;
        struct inode *inode;
        struct inode *dir;
-       struct nfs_open_context *ctx;
-       struct iattr attr;
        int openflags, ret = 0;
 
        if (nd->flags & LOOKUP_RCU)
@@ -1554,57 +1552,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
        /* We cannot do exclusive creation on a positive dentry */
        if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
                goto no_open_dput;
-       /* We can't create new files here */
-       openflags &= ~(O_CREAT|O_EXCL);
-
-       ctx = create_nfs_open_context(dentry, openflags);
-       ret = PTR_ERR(ctx);
-       if (IS_ERR(ctx))
-               goto out;
 
-       attr.ia_valid = ATTR_OPEN;
-       if (openflags & O_TRUNC) {
-               attr.ia_valid |= ATTR_SIZE;
-               attr.ia_size = 0;
-               nfs_wb_all(inode);
-       }
-
-       /*
-        * Note: we're not holding inode->i_mutex and so may be racing with
-        * operations that change the directory. We therefore save the
-        * change attribute *before* we do the RPC call.
-        */
-       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
-       if (IS_ERR(inode)) {
-               ret = PTR_ERR(inode);
-               switch (ret) {
-               case -EPERM:
-               case -EACCES:
-               case -EDQUOT:
-               case -ENOSPC:
-               case -EROFS:
-                       goto out_put_ctx;
-               default:
-                       goto out_drop;
-               }
-       }
-       iput(inode);
-       if (inode != dentry->d_inode)
-               goto out_drop;
+       /* Let f_op->open() actually open (and revalidate) the file */
+       ret = 1;
 
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-       ret = nfs_intent_set_file(nd, ctx);
-       if (ret >= 0)
-               ret = 1;
 out:
        dput(parent);
        return ret;
-out_drop:
-       d_drop(dentry);
-       ret = 0;
-out_put_ctx:
-       put_nfs_open_context(ctx);
-       goto out;
 
 no_open_dput:
        dput(parent);
index 56311ca5f9f8183d3aa8c3aa8c9922db05ee32a3..a6708e6b438dd55f2924e5bb78c809c1575a97a9 100644 (file)
@@ -879,12 +879,81 @@ const struct file_operations nfs_file_operations = {
 static int
 nfs4_file_open(struct inode *inode, struct file *filp)
 {
+       struct nfs_open_context *ctx;
+       struct dentry *dentry = filp->f_path.dentry;
+       struct dentry *parent = NULL;
+       struct inode *dir;
+       unsigned openflags = filp->f_flags;
+       struct iattr attr;
+       int err;
+
+       BUG_ON(inode != dentry->d_inode);
        /*
-        * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
-        * this point, then something is very wrong
+        * If no cached dentry exists or if it's negative, NFSv4 handled the
+        * opens in ->lookup() or ->create().
+        *
+        * We only get this far for a cached positive dentry.  We skipped
+        * revalidation, so handle it here by dropping the dentry and returning
+        * -EOPENSTALE.  The VFS will retry the lookup/create/open.
         */
-       dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
-       return -ENOTDIR;
+
+       dprintk("NFS: open file(%s/%s)\n",
+               dentry->d_parent->d_name.name,
+               dentry->d_name.name);
+
+       if ((openflags & O_ACCMODE) == 3)
+               openflags--;
+
+       /* We can't create new files here */
+       openflags &= ~(O_CREAT|O_EXCL);
+
+       parent = dget_parent(dentry);
+       dir = parent->d_inode;
+
+       ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+       err = PTR_ERR(ctx);
+       if (IS_ERR(ctx))
+               goto out;
+
+       attr.ia_valid = ATTR_OPEN;
+       if (openflags & O_TRUNC) {
+               attr.ia_valid |= ATTR_SIZE;
+               attr.ia_size = 0;
+               nfs_wb_all(inode);
+       }
+
+       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               switch (err) {
+               case -EPERM:
+               case -EACCES:
+               case -EDQUOT:
+               case -ENOSPC:
+               case -EROFS:
+                       goto out_put_ctx;
+               default:
+                       goto out_drop;
+               }
+       }
+       iput(inode);
+       if (inode != dentry->d_inode)
+               goto out_drop;
+
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+       nfs_file_set_open_context(filp, ctx);
+       err = 0;
+
+out_put_ctx:
+       put_nfs_open_context(ctx);
+out:
+       dput(parent);
+       return err;
+
+out_drop:
+       d_drop(dentry);
+       err = -EOPENSTALE;
+       goto out_put_ctx;
 }
 
 const struct file_operations nfs4_file_operations = {
index 204438cc914ea522b83907aaf618bb0dbcfa4068..34a10d78b839f4c73b3d851e19820bc712129f36 100644 (file)
@@ -11,7 +11,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
        struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
 
        for (f = exp->ex_flavors; f < end; f++) {
-               if (f->pseudoflavor == rqstp->rq_flavor)
+               if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
                        return f->flags;
        }
        return exp->ex_flags;
index dcb52b8845194db72aa37bb05f5f4ea2d49b9691..ba233499b9a5fc1b374bc7d79ad8f636f01135b0 100644 (file)
@@ -706,7 +706,7 @@ static struct cache_head *svc_export_alloc(void)
                return NULL;
 }
 
-struct cache_detail svc_export_cache_template = {
+static struct cache_detail svc_export_cache_template = {
        .owner          = THIS_MODULE,
        .hash_size      = EXPORT_HASHMAX,
        .name           = "nfsd.export",
@@ -904,13 +904,13 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
                return 0;
        /* ip-address based client; check sec= export option: */
        for (f = exp->ex_flavors; f < end; f++) {
-               if (f->pseudoflavor == rqstp->rq_flavor)
+               if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
                        return 0;
        }
        /* defaults in absence of sec= options: */
        if (exp->ex_nflavors == 0) {
-               if (rqstp->rq_flavor == RPC_AUTH_NULL ||
-                   rqstp->rq_flavor == RPC_AUTH_UNIX)
+               if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
+                   rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
                        return 0;
        }
        return nfserr_wrongsec;
index 9559ce468732e7c00ae40cd4fc3a379a525ecef7..e6c38159622fe6bc337f3d24ada7db683838ceac 100644 (file)
@@ -58,6 +58,7 @@ static int nfsd_inject_set(void *op_ptr, u64 val)
 
 static int nfsd_inject_get(void *data, u64 *val)
 {
+       *val = 0;
        return 0;
 }
 
index c8e9f637153ab3e44ba293f7097e7b32e77d4e54..a5fd6b982f277ce648bbd528947964ea2ef63c73 100644 (file)
@@ -650,9 +650,10 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
        struct rpc_clnt *client;
 
        if (clp->cl_minorversion == 0) {
-               if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+               if (!clp->cl_cred.cr_principal &&
+                               (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
                        return -EINVAL;
-               args.client_name = clp->cl_principal;
+               args.client_name = clp->cl_cred.cr_principal;
                args.prognumber = conn->cb_prog,
                args.protocol = XPRT_TRANSPORT_TCP;
                args.authflavor = clp->cl_flavor;
index 286a7f8f2024fa9667d16a0c4d1444fb0de5be98..dae36f1dee95e68defce943bedf01efc46d61a54 100644 (file)
@@ -605,7 +605,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
 static __be32
 do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
 {
-       if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+       if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
                if (numeric_name_to_id(rqstp, type, name, namelen, id))
                        return 0;
                /*
@@ -618,7 +618,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
 static int
 do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
 {
-       if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+       if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
                return sprintf(name, "%u", id);
        return idmap_id_to_name(rqstp, type, id, name);
 }
index ed3f9206a0ee87c914f133492f1f6011775bdef8..5ff0b7b9fc08f22f39cc1f2d83062baceb773bdc 100644 (file)
@@ -570,7 +570,7 @@ static ssize_t
 cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 {
        struct cld_upcall *tmp, *cup;
-       struct cld_msg *cmsg = (struct cld_msg *)src;
+       struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
        uint32_t xid;
        struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
                                                nfsd_net_id);
@@ -1029,7 +1029,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
        return ret;
 }
 
-struct notifier_block nfsd4_cld_block = {
+static struct notifier_block nfsd4_cld_block = {
        .notifier_call = rpc_pipefs_event,
 };
 
index 03f82c0bc35d725b7b460a9ef0ef93fd80409e38..8fdc9ec5c5d359f8defb2766e710eb35fc08c3b0 100644 (file)
@@ -42,6 +42,7 @@
 #include <linux/sunrpc/clnt.h>
 #include "xdr4.h"
 #include "vfs.h"
+#include "current_stateid.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
 
@@ -447,37 +448,69 @@ static struct list_head close_lru;
  *
  * which we should reject.
  */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
+static unsigned int
+bmap_to_share_mode(unsigned long bmap) {
        int i;
+       unsigned int access = 0;
 
-       *access = 0;
        for (i = 1; i < 4; i++) {
                if (test_bit(i, &bmap))
-                       *access |= i;
-       }
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-       int i;
-
-       *deny = 0;
-       for (i = 0; i < 4; i++) {
-               if (test_bit(i, &bmap))
-                       *deny |= i ;
+                       access |= i;
        }
+       return access;
 }
 
-static int
+static bool
 test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
        unsigned int access, deny;
 
-       set_access(&access, stp->st_access_bmap);
-       set_deny(&deny, stp->st_deny_bmap);
+       access = bmap_to_share_mode(stp->st_access_bmap);
+       deny = bmap_to_share_mode(stp->st_deny_bmap);
        if ((access & open->op_share_deny) || (deny & open->op_share_access))
-               return 0;
-       return 1;
+               return false;
+       return true;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __set_bit(access, &stp->st_access_bmap);
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __clear_bit(access, &stp->st_access_bmap);
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       return test_bit(access, &stp->st_access_bmap);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __set_bit(access, &stp->st_deny_bmap);
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __clear_bit(access, &stp->st_deny_bmap);
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       return test_bit(access, &stp->st_deny_bmap);
 }
 
 static int nfs4_access_to_omode(u32 access)
@@ -493,6 +526,20 @@ static int nfs4_access_to_omode(u32 access)
        BUG();
 }
 
+/* release all access and file references for a given stateid */
+static void
+release_all_access(struct nfs4_ol_stateid *stp)
+{
+       int i;
+
+       for (i = 1; i < 4; i++) {
+               if (test_access(i, stp))
+                       nfs4_file_put_access(stp->st_file,
+                                            nfs4_access_to_omode(i));
+               clear_access(i, stp);
+       }
+}
+
 static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
 {
        list_del(&stp->st_perfile);
@@ -501,16 +548,7 @@ static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
 
 static void close_generic_stateid(struct nfs4_ol_stateid *stp)
 {
-       int i;
-
-       if (stp->st_access_bmap) {
-               for (i = 1; i < 4; i++) {
-                       if (test_bit(i, &stp->st_access_bmap))
-                               nfs4_file_put_access(stp->st_file,
-                                               nfs4_access_to_omode(i));
-                       __clear_bit(i, &stp->st_access_bmap);
-               }
-       }
+       release_all_access(stp); /* put every access held before dropping the file */
        put_nfs4_file(stp->st_file);
        stp->st_file = NULL;
 }
@@ -885,7 +923,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
        struct nfsd4_session *new;
        struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
        int numslots, slotsize;
-       int status;
+       __be32 status;
        int idx;
 
        /*
@@ -984,7 +1022,8 @@ static inline void
 renew_client_locked(struct nfs4_client *clp)
 {
        if (is_client_expired(clp)) {
-               dprintk("%s: client (clientid %08x/%08x) already expired\n",
+               WARN_ON(1);
+               printk("%s: client (clientid %08x/%08x) already expired\n",
                        __func__,
                        clp->cl_clientid.cl_boot,
                        clp->cl_clientid.cl_id);
@@ -1049,9 +1088,7 @@ free_client(struct nfs4_client *clp)
                list_del(&ses->se_perclnt);
                nfsd4_put_session_locked(ses);
        }
-       if (clp->cl_cred.cr_group_info)
-               put_group_info(clp->cl_cred.cr_group_info);
-       kfree(clp->cl_principal);
+       free_svc_cred(&clp->cl_cred);
        kfree(clp->cl_name.data);
        kfree(clp);
 }
@@ -1132,12 +1169,21 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
        target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
 }
 
-static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
 {
+       if (source->cr_principal) { /* target gets its own kstrdup'd copy */
+               target->cr_principal =
+                               kstrdup(source->cr_principal, GFP_KERNEL);
+               if (target->cr_principal == NULL)
+                       return -ENOMEM; /* fails before any other field is copied */
+       } else
+               target->cr_principal = NULL;
+       target->cr_flavor = source->cr_flavor;
        target->cr_uid = source->cr_uid;
        target->cr_gid = source->cr_gid;
        target->cr_group_info = source->cr_group_info;
        get_group_info(target->cr_group_info);
+       return 0; /* success; the only failure is -ENOMEM above */
 }
 
 static int same_name(const char *n1, const char *n2)
@@ -1157,11 +1203,31 @@ same_clid(clientid_t *cl1, clientid_t *cl2)
        return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id);
 }
 
-/* XXX what about NGROUP */
+static bool groups_equal(struct group_info *g1, struct group_info *g2)
+{
+       int i;
+
+       if (g1->ngroups != g2->ngroups) /* NOTE(review): g1/g2 not NULL-checked */
+               return false;
+       for (i=0; i<g1->ngroups; i++) /* positional: same gids in the same order */
+               if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
+                       return false;
+       return true;
+}
+
 static int
 same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
 {
-       return cr1->cr_uid == cr2->cr_uid;
+       if ((cr1->cr_flavor != cr2->cr_flavor)
+               || (cr1->cr_uid != cr2->cr_uid)
+               || (cr1->cr_gid != cr2->cr_gid)
+               || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
+               return false;
+       if (cr1->cr_principal == cr2->cr_principal)
+               return true;
+       if (!cr1->cr_principal || !cr2->cr_principal)
+               return false;
+       return 0 == strcmp(cr1->cr_principal, cr1->cr_principal);
 }
 
 static void gen_clid(struct nfs4_client *clp)
@@ -1204,25 +1270,20 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
 {
        struct nfs4_client *clp;
        struct sockaddr *sa = svc_addr(rqstp);
-       char *princ;
+       int ret;
 
        clp = alloc_client(name);
        if (clp == NULL)
                return NULL;
 
        INIT_LIST_HEAD(&clp->cl_sessions);
-
-       princ = svc_gss_principal(rqstp);
-       if (princ) {
-               clp->cl_principal = kstrdup(princ, GFP_KERNEL);
-               if (clp->cl_principal == NULL) {
-                       spin_lock(&client_lock);
-                       free_client(clp);
-                       spin_unlock(&client_lock);
-                       return NULL;
-               }
+       ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+       if (ret) {
+               spin_lock(&client_lock);
+               free_client(clp);
+               spin_unlock(&client_lock);
+               return NULL;
        }
-
        idr_init(&clp->cl_stateids);
        memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
        atomic_set(&clp->cl_refcount, 0);
@@ -1240,8 +1301,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
        rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
        copy_verf(clp, verf);
        rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
-       clp->cl_flavor = rqstp->rq_flavor;
-       copy_cred(&clp->cl_cred, &rqstp->rq_cred);
        gen_confirm(clp);
        clp->cl_cb_session = NULL;
        return clp;
@@ -1470,18 +1529,32 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
        clid->flags = new->cl_exchange_flags;
 }
 
+static bool client_has_state(struct nfs4_client *clp)
+{
+       /*
+        * True if the client has any openowners, delegations or sessions.
+        * Note the cl_openowners check isn't quite right: there's no need
+        * to count owners without stateids.
+        * Also note we should probably be using this in 4.0 case too.
+        */
+       return !list_empty(&clp->cl_openowners)
+               || !list_empty(&clp->cl_delegations)
+               || !list_empty(&clp->cl_sessions);
+}
+
 __be32
 nfsd4_exchange_id(struct svc_rqst *rqstp,
                  struct nfsd4_compound_state *cstate,
                  struct nfsd4_exchange_id *exid)
 {
        struct nfs4_client *unconf, *conf, *new;
-       int status;
+       __be32 status;
        unsigned int            strhashval;
        char                    dname[HEXDIR_LEN];
        char                    addr_str[INET6_ADDRSTRLEN];
        nfs4_verifier           verf = exid->verifier;
        struct sockaddr         *sa = svc_addr(rqstp);
+       bool    update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
 
        rpc_ntop(sa, addr_str, sizeof(addr_str));
        dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1507,71 +1580,63 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
        status = nfs4_make_rec_clidname(dname, &exid->clname);
 
        if (status)
-               goto error;
+               return status;
 
        strhashval = clientstr_hashval(dname);
 
+       /* Cases below refer to rfc 5661 section 18.35.4: */
        nfs4_lock_state();
-       status = nfs_ok;
-
        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
-               if (!clp_used_exchangeid(conf)) {
-                       status = nfserr_clid_inuse; /* XXX: ? */
-                       goto out;
-               }
-               if (!same_verf(&verf, &conf->cl_verifier)) {
-                       /* 18.35.4 case 8 */
-                       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+               bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
+               bool verfs_match = same_verf(&verf, &conf->cl_verifier);
+
+               if (update) {
+                       if (!clp_used_exchangeid(conf)) { /* buggy client */
+                               status = nfserr_inval;
+                               goto out;
+                       }
+                       if (!creds_match) { /* case 9 */
+                               status = nfserr_perm;
+                               goto out;
+                       }
+                       if (!verfs_match) { /* case 8 */
                                status = nfserr_not_same;
                                goto out;
                        }
-                       /* Client reboot: destroy old state */
-                       expire_client(conf);
-                       goto out_new;
+                       /* case 6 */
+                       exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+                       new = conf;
+                       goto out_copy;
                }
-               if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
-                       /* 18.35.4 case 9 */
-                       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
-                               status = nfserr_perm;
+               if (!creds_match) { /* case 3 */
+                       if (client_has_state(conf)) {
+                               status = nfserr_clid_inuse;
                                goto out;
                        }
                        expire_client(conf);
                        goto out_new;
                }
-               /*
-                * Set bit when the owner id and verifier map to an already
-                * confirmed client id (18.35.3).
-                */
-               exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
-
-               /*
-                * Falling into 18.35.4 case 2, possible router replay.
-                * Leave confirmed record intact and return same result.
-                */
-               copy_verf(conf, &verf);
-               new = conf;
-               goto out_copy;
+               if (verfs_match) { /* case 2 */
+                       conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+                       new = conf;
+                       goto out_copy;
+               }
+               /* case 5, client reboot */
+               goto out_new;
        }
 
-       /* 18.35.4 case 7 */
-       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+       if (update) { /* case 7 */
                status = nfserr_noent;
                goto out;
        }
 
        unconf  = find_unconfirmed_client_by_str(dname, strhashval);
-       if (unconf) {
-               /*
-                * Possible retry or client restart.  Per 18.35.4 case 4,
-                * a new unconfirmed record should be generated regardless
-                * of whether any properties have changed.
-                */
+       if (unconf) /* case 4, possible retry or client restart */
                expire_client(unconf);
-       }
 
+       /* case 1 (normal case) */
 out_new:
-       /* Normal case */
        new = create_client(exid->clname, dname, rqstp, &verf);
        if (new == NULL) {
                status = nfserr_jukebox;
@@ -1584,7 +1649,7 @@ out_copy:
        exid->clientid.cl_boot = new->cl_clientid.cl_boot;
        exid->clientid.cl_id = new->cl_clientid.cl_id;
 
-       exid->seqid = 1;
+       exid->seqid = new->cl_cs_slot.sl_seqid + 1;
        nfsd4_set_ex_flags(new, exid);
 
        dprintk("nfsd4_exchange_id seqid %d flags %x\n",
@@ -1593,12 +1658,10 @@ out_copy:
 
 out:
        nfs4_unlock_state();
-error:
-       dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
        return status;
 }
 
-static int
+static __be32
 check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
 {
        dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
@@ -1626,7 +1689,7 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
  */
 static void
 nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
-                          struct nfsd4_clid_slot *slot, int nfserr)
+                          struct nfsd4_clid_slot *slot, __be32 nfserr)
 {
        slot->sl_status = nfserr;
        memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
@@ -1657,7 +1720,7 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
                                /* seqid, slotID, slotID, slotID, status */ \
                        5 ) * sizeof(__be32))
 
-static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
 {
        return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
                || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
@@ -1673,7 +1736,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
        struct nfsd4_session *new;
        struct nfsd4_clid_slot *cs_slot = NULL;
        bool confirm_me = false;
-       int status = 0;
+       __be32 status = 0;
 
        if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
                return nfserr_inval;
@@ -1686,16 +1749,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                cs_slot = &conf->cl_cs_slot;
                status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
                if (status == nfserr_replay_cache) {
-                       dprintk("Got a create_session replay! seqid= %d\n",
-                               cs_slot->sl_seqid);
-                       /* Return the cached reply status */
                        status = nfsd4_replay_create_session(cr_ses, cs_slot);
                        goto out;
                } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
                        status = nfserr_seq_misordered;
-                       dprintk("Sequence misordered!\n");
-                       dprintk("Expected seqid= %d but got seqid= %d\n",
-                               cs_slot->sl_seqid, cr_ses->seqid);
                        goto out;
                }
        } else if (unconf) {
@@ -1704,7 +1761,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                        status = nfserr_clid_inuse;
                        goto out;
                }
-
                cs_slot = &unconf->cl_cs_slot;
                status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
                if (status) {
@@ -1712,7 +1768,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                        status = nfserr_seq_misordered;
                        goto out;
                }
-
                confirm_me = true;
                conf = unconf;
        } else {
@@ -1749,8 +1804,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
 
        /* cache solo and embedded create sessions under the state lock */
        nfsd4_cache_create_session(cr_ses, cs_slot, status);
-       if (confirm_me)
+       if (confirm_me) {
+               unsigned int hash = clientstr_hashval(unconf->cl_recdir);
+               struct nfs4_client *old =
+                       find_confirmed_client_by_str(conf->cl_recdir, hash);
+               if (old)
+                       expire_client(old);
                move_to_confirmed(conf);
+       }
 out:
        nfs4_unlock_state();
        dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1818,7 +1879,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
                      struct nfsd4_destroy_session *sessionid)
 {
        struct nfsd4_session *ses;
-       u32 status = nfserr_badsession;
+       __be32 status = nfserr_badsession;
 
        /* Notes:
         * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1914,7 +1975,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
        struct nfsd4_session *session;
        struct nfsd4_slot *slot;
        struct nfsd4_conn *conn;
-       int status;
+       __be32 status;
 
        if (resp->opcnt != 1)
                return nfserr_sequence_pos;
@@ -2008,18 +2069,11 @@ out:
        return status;
 }
 
-static inline bool has_resources(struct nfs4_client *clp)
-{
-       return !list_empty(&clp->cl_openowners)
-               || !list_empty(&clp->cl_delegations)
-               || !list_empty(&clp->cl_sessions);
-}
-
 __be32
 nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
 {
        struct nfs4_client *conf, *unconf, *clp;
-       int status = 0;
+       __be32 status = 0;
 
        nfs4_lock_state();
        unconf = find_unconfirmed_client(&dc->clientid);
@@ -2028,7 +2082,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
        if (conf) {
                clp = conf;
 
-               if (!is_client_expired(conf) && has_resources(conf)) {
+               if (!is_client_expired(conf) && client_has_state(conf)) {
                        status = nfserr_clientid_busy;
                        goto out;
                }
@@ -2055,7 +2109,7 @@ out:
 __be32
 nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
 {
-       int status = 0;
+       __be32 status = 0;
 
        if (rc->rca_one_fs) {
                if (!cstate->current_fh.fh_dentry)
@@ -2106,17 +2160,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
        if (status)
                return status;
 
-       /* 
-        * XXX The Duplicate Request Cache (DRC) has been checked (??)
-        * We get here on a DRC miss.
-        */
-
        strhashval = clientstr_hashval(dname);
 
+       /* Cases below refer to rfc 3530 section 14.2.33: */
        nfs4_lock_state();
        conf = find_confirmed_client_by_str(dname, strhashval);
        if (conf) {
-               /* RFC 3530 14.2.33 CASE 0: */
+               /* case 0: */
                status = nfserr_clid_inuse;
                if (clp_used_exchangeid(conf))
                        goto out;
@@ -2129,63 +2179,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                        goto out;
                }
        }
-       /*
-        * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
-        * has a description of SETCLIENTID request processing consisting
-        * of 5 bullet points, labeled as CASE0 - CASE4 below.
-        */
        unconf = find_unconfirmed_client_by_str(dname, strhashval);
+       if (unconf)
+               expire_client(unconf);
        status = nfserr_jukebox;
-       if (!conf) {
-               /*
-                * RFC 3530 14.2.33 CASE 4:
-                * placed first, because it is the normal case
-                */
-               if (unconf)
-                       expire_client(unconf);
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
-               gen_clid(new);
-       } else if (same_verf(&conf->cl_verifier, &clverifier)) {
-               /*
-                * RFC 3530 14.2.33 CASE 1:
-                * probable callback update
-                */
-               if (unconf) {
-                       /* Note this is removing unconfirmed {*x***},
-                        * which is stronger than RFC recommended {vxc**}.
-                        * This has the advantage that there is at most
-                        * one {*x***} in either list at any time.
-                        */
-                       expire_client(unconf);
-               }
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
+       new = create_client(clname, dname, rqstp, &clverifier);
+       if (new == NULL)
+               goto out;
+       if (conf && same_verf(&conf->cl_verifier, &clverifier))
+               /* case 1: probable callback update */
                copy_clid(new, conf);
-       } else if (!unconf) {
-               /*
-                * RFC 3530 14.2.33 CASE 2:
-                * probable client reboot; state will be removed if
-                * confirmed.
-                */
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
-               gen_clid(new);
-       } else {
-               /*
-                * RFC 3530 14.2.33 CASE 3:
-                * probable client reboot; state will be removed if
-                * confirmed.
-                */
-               expire_client(unconf);
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
+       else /* case 4 (new client) or cases 2, 3 (client reboot): */
                gen_clid(new);
-       }
        /*
         * XXX: we should probably set this at creation time, and check
         * for consistent minorversion use throughout:
@@ -2203,17 +2208,11 @@ out:
 }
 
 
-/*
- * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
- * a description of SETCLIENTID_CONFIRM request processing consisting of 4
- * bullets, labeled as CASE1 - CASE4 below.
- */
 __be32
 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                         struct nfsd4_compound_state *cstate,
                         struct nfsd4_setclientid_confirm *setclientid_confirm)
 {
-       struct sockaddr *sa = svc_addr(rqstp);
        struct nfs4_client *conf, *unconf;
        nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
        clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -2221,84 +2220,44 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 
        if (STALE_CLIENTID(clid))
                return nfserr_stale_clientid;
-       /* 
-        * XXX The Duplicate Request Cache (DRC) has been checked (??)
-        * We get here on a DRC miss.
-        */
-
        nfs4_lock_state();
 
        conf = find_confirmed_client(clid);
        unconf = find_unconfirmed_client(clid);
-
-       status = nfserr_clid_inuse;
-       if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
-               goto out;
-       if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
-               goto out;
-
        /*
-        * section 14.2.34 of RFC 3530 has a description of
-        * SETCLIENTID_CONFIRM request processing consisting
-        * of 4 bullet points, labeled as CASE1 - CASE4 below.
+        * We try hard to give out unique clientid's, so if we get an
+        * attempt to confirm the same clientid with a different cred,
+        * there's a bug somewhere.  Let's charitably assume it's our
+        * bug.
         */
-       if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
-               /*
-                * RFC 3530 14.2.34 CASE 1:
-                * callback update
-                */
-               if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
-                       status = nfserr_clid_inuse;
-               else {
-                       nfsd4_change_callback(conf, &unconf->cl_cb_conn);
-                       nfsd4_probe_callback(conf);
-                       expire_client(unconf);
+       status = nfserr_serverfault;
+       if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+               goto out;
+       if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+               goto out;
+       /* cases below refer to rfc 3530 section 14.2.34: */
+       if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+               if (conf && !unconf) /* case 2: probable retransmit */
                        status = nfs_ok;
+               else /* case 4: client hasn't noticed we rebooted yet? */
+                       status = nfserr_stale_clientid;
+               goto out;
+       }
+       status = nfs_ok;
+       if (conf) { /* case 1: callback update */
+               nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+               nfsd4_probe_callback(conf);
+               expire_client(unconf);
+       } else { /* case 3: normal case; new or rebooted client */
+               unsigned int hash = clientstr_hashval(unconf->cl_recdir);
 
+               conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+               if (conf) {
+                       nfsd4_client_record_remove(conf);
+                       expire_client(conf);
                }
-       } else if (conf && !unconf) {
-               /*
-                * RFC 3530 14.2.34 CASE 2:
-                * probable retransmitted request; play it safe and
-                * do nothing.
-                */
-               if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
-                       status = nfserr_clid_inuse;
-               else
-                       status = nfs_ok;
-       } else if (!conf && unconf
-                       && same_verf(&unconf->cl_confirm, &confirm)) {
-               /*
-                * RFC 3530 14.2.34 CASE 3:
-                * Normal case; new or rebooted client:
-                */
-               if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
-                       status = nfserr_clid_inuse;
-               } else {
-                       unsigned int hash =
-                               clientstr_hashval(unconf->cl_recdir);
-                       conf = find_confirmed_client_by_str(unconf->cl_recdir,
-                                                           hash);
-                       if (conf) {
-                               nfsd4_client_record_remove(conf);
-                               expire_client(conf);
-                       }
-                       move_to_confirmed(unconf);
-                       conf = unconf;
-                       nfsd4_probe_callback(conf);
-                       status = nfs_ok;
-               }
-       } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
-           && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
-                                                               &confirm)))) {
-               /*
-                * RFC 3530 14.2.34 CASE 4:
-                * Client probably hasn't noticed that we rebooted yet.
-                */
-               status = nfserr_stale_clientid;
-       } else {
-               /* check that we have hit one of the cases...*/
-               status = nfserr_clid_inuse;
+               move_to_confirmed(unconf);
+               nfsd4_probe_callback(unconf);
        }
 out:
        nfs4_unlock_state();
@@ -2454,8 +2413,8 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
        stp->st_file = fp;
        stp->st_access_bmap = 0;
        stp->st_deny_bmap = 0;
-       __set_bit(open->op_share_access, &stp->st_access_bmap);
-       __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+       set_access(open->op_share_access, stp);
+       set_deny(open->op_share_deny, stp);
        stp->st_openstp = NULL;
 }
 
@@ -2534,8 +2493,8 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
        ret = nfserr_locked;
        /* Search for conflicting share reservations */
        list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
-               if (test_bit(deny_type, &stp->st_deny_bmap) ||
-                   test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
+               if (test_deny(deny_type, stp) ||
+                   test_deny(NFS4_SHARE_DENY_BOTH, stp))
                        goto out;
        }
        ret = nfs_ok;
@@ -2791,7 +2750,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
        bool new_access;
        __be32 status;
 
-       new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+       new_access = !test_access(op_share_access, stp);
        if (new_access) {
                status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
                if (status)
@@ -2806,8 +2765,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
                return status;
        }
        /* remember the open */
-       __set_bit(op_share_access, &stp->st_access_bmap);
-       __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+       set_access(op_share_access, stp);
+       set_deny(open->op_share_deny, stp);
 
        return nfs_ok;
 }
@@ -3282,18 +3241,18 @@ STALE_STATEID(stateid_t *stateid)
 }
 
 static inline int
-access_permit_read(unsigned long access_bmap)
+access_permit_read(struct nfs4_ol_stateid *stp)
 {
-       return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap);
+       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+               test_access(NFS4_SHARE_ACCESS_WRITE, stp); /* write-only also passes */
 }
 
 static inline int
-access_permit_write(unsigned long access_bmap)
+access_permit_write(struct nfs4_ol_stateid *stp)
 {
-       return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
+       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp); /* READ alone is not enough */
 }
 
 static
@@ -3304,9 +3263,9 @@ __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
        /* For lock stateid's, we test the parent open, not the lock: */
        if (stp->st_openstp)
                stp = stp->st_openstp;
-       if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
+       if ((flags & WR_STATE) && !access_permit_write(stp))
                 goto out;
-       if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
+       if ((flags & RD_STATE) && !access_permit_read(stp))
                 goto out;
        status = nfs_ok;
 out:
@@ -3346,7 +3305,7 @@ static bool stateid_generation_after(stateid_t *a, stateid_t *b)
        return (s32)a->si_generation - (s32)b->si_generation > 0;
 }
 
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
+static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
 {
        /*
         * When sessions are used the stateid generation number is ignored
@@ -3655,10 +3614,10 @@ out:
 
 static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
 {
-       if (!test_bit(access, &stp->st_access_bmap))
+       if (!test_access(access, stp)) /* mode not held: nothing to release */
                return;
        nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
-       __clear_bit(access, &stp->st_access_bmap);
+       clear_access(access, stp); /* forget the mode once its file access is put */
 }
 
 static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
@@ -3680,12 +3639,12 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
 }
 
 static void
-reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
+reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
 {
        int i;
        for (i = 0; i < 4; i++) {
                if ((i & deny) != i)
-                       __clear_bit(i, bmap);
+                       clear_deny(i, stp); /* i has a bit outside @deny: drop it */
        }
 }
 
@@ -3712,19 +3671,19 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
        if (status)
                goto out; 
        status = nfserr_inval;
-       if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
-               dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
+       if (!test_access(od->od_share_access, stp)) {
+               dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
                        stp->st_access_bmap, od->od_share_access);
                goto out;
        }
-       if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) {
+       if (!test_deny(od->od_share_deny, stp)) {
                dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
                        stp->st_deny_bmap, od->od_share_deny);
                goto out;
        }
        nfs4_stateid_downgrade(stp, od->od_share_access);
 
-       reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
+       reset_union_bmap_deny(od->od_share_deny, stp);
 
        update_stateid(&stp->st_stid.sc_stateid);
        memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4014,13 +3973,13 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
        struct nfs4_file *fp = lock_stp->st_file;
        int oflag = nfs4_access_to_omode(access);
 
-       if (test_bit(access, &lock_stp->st_access_bmap))
+       if (test_access(access, lock_stp))
                return;
        nfs4_file_get_access(fp, oflag);
-       __set_bit(access, &lock_stp->st_access_bmap);
+       set_access(access, lock_stp);
 }
 
-__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
 {
        struct nfs4_file *fi = ost->st_file;
        struct nfs4_openowner *oo = openowner(ost->st_stateowner);
index 74c00bc92b9af6b01e95e55c119b90d61fbf9d34..4949667c84ea0c3d687a46faf0a455c410c39b6f 100644 (file)
@@ -1674,12 +1674,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 
 static void write32(__be32 **p, u32 n)
 {
-       *(*p)++ = n;
+       *(*p)++ = htonl(n);
 }
 
 static void write64(__be32 **p, u64 n)
 {
-       write32(p, (u32)(n >> 32));
+       write32(p, (n >> 32));
        write32(p, (u32)n);
 }
 
@@ -1744,15 +1744,16 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _
 }
 
 /* Encode as an array of strings the string given with components
- * separated @sep.
+ * separated @sep, escaped with esc_enter and esc_exit.
  */
-static __be32 nfsd4_encode_components(char sep, char *components,
-                                  __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_components_esc(char sep, char *components,
+                                  __be32 **pp, int *buflen,
+                                  char esc_enter, char esc_exit)
 {
        __be32 *p = *pp;
        __be32 *countp = p;
        int strlen, count=0;
-       char *str, *end;
+       char *str, *end, *next;
 
        dprintk("nfsd4_encode_components(%s)\n", components);
        if ((*buflen -= 4) < 0)
@@ -1760,8 +1761,23 @@ static __be32 nfsd4_encode_components(char sep, char *components,
        WRITE32(0); /* We will fill this in with @count later */
        end = str = components;
        while (*end) {
-               for (; *end && (*end != sep); end++)
-                       ; /* Point to end of component */
+               bool found_esc = false;
+
+               /* try to parse as esc_start, ..., esc_end, sep */
+               if (*str == esc_enter) {
+                       for (; *end && (*end != esc_exit); end++)
+                               /* find esc_exit or end of string */;
+                       next = end + 1;
+                       if (*end && (!*next || *next == sep)) {
+                               str++;
+                               found_esc = true;
+                       }
+               }
+
+               if (!found_esc)
+                       for (; *end && (*end != sep); end++)
+                               /* find sep or end of string */;
+
                strlen = end - str;
                if (strlen) {
                        if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
@@ -1780,6 +1796,15 @@ static __be32 nfsd4_encode_components(char sep, char *components,
        return 0;
 }
 
+/* Encode as an array of strings the string given with components
+ * separated @sep.
+ */
+static __be32 nfsd4_encode_components(char sep, char *components,
+                                  __be32 **pp, int *buflen)
+{
+       return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
+}
+
 /*
  * encode a location element of a fs_locations structure
  */
@@ -1789,7 +1814,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
        __be32 status;
        __be32 *p = *pp;
 
-       status = nfsd4_encode_components(':', location->hosts, &p, buflen);
+       status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
+                                               '[', ']');
        if (status)
                return status;
        status = nfsd4_encode_components('/', location->path, &p, buflen);
@@ -3251,7 +3277,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
 }
 
 static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
                         struct nfsd4_exchange_id *exid)
 {
        __be32 *p;
@@ -3306,7 +3332,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
 }
 
 static __be32
-nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
                            struct nfsd4_create_session *sess)
 {
        __be32 *p;
@@ -3355,14 +3381,14 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
 }
 
 static __be32
-nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
                             struct nfsd4_destroy_session *destroy_session)
 {
        return nfserr;
 }
 
 static __be32
-nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
                          struct nfsd4_free_stateid *free_stateid)
 {
        __be32 *p;
@@ -3371,13 +3397,13 @@ nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
                return nfserr;
 
        RESERVE_SPACE(4);
-       WRITE32(nfserr);
+       *p++ = nfserr;
        ADJUST_ARGS();
        return nfserr;
 }
 
 static __be32
-nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
                      struct nfsd4_sequence *seq)
 {
        __be32 *p;
@@ -3399,8 +3425,8 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
        return 0;
 }
 
-__be32
-nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
+static __be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
                          struct nfsd4_test_stateid *test_stateid)
 {
        struct nfsd4_test_stateid_id *stateid, *next;
@@ -3503,7 +3529,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
  * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
  * will be at least a page and will therefore hold the xdr_buf head.
  */
-int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
 {
        struct xdr_buf *xb = &resp->rqstp->rq_res;
        struct nfsd4_session *session = NULL;
index 72699885ac4892d4faf08bad18c1c09a286d8a8b..c55298ed5772577e5afe3bd613c3e5a0df3b69dd 100644 (file)
@@ -661,6 +661,7 @@ static ssize_t __write_ports_addfd(char *buf)
 {
        char *mesg = buf;
        int fd, err;
+       struct net *net = &init_net;
 
        err = get_int(&mesg, &fd);
        if (err != 0 || fd < 0)
@@ -672,6 +673,8 @@ static ssize_t __write_ports_addfd(char *buf)
 
        err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
        if (err < 0) {
+               if (nfsd_serv->sv_nrthreads == 1)
+                       svc_shutdown_net(nfsd_serv, net);
                svc_destroy(nfsd_serv);
                return err;
        }
@@ -709,6 +712,7 @@ static ssize_t __write_ports_addxprt(char *buf)
        char transport[16];
        struct svc_xprt *xprt;
        int port, err;
+       struct net *net = &init_net;
 
        if (sscanf(buf, "%15s %4u", transport, &port) != 2)
                return -EINVAL;
@@ -720,12 +724,12 @@ static ssize_t __write_ports_addxprt(char *buf)
        if (err != 0)
                return err;
 
-       err = svc_create_xprt(nfsd_serv, transport, &init_net,
+       err = svc_create_xprt(nfsd_serv, transport, net,
                                PF_INET, port, SVC_SOCK_ANONYMOUS);
        if (err < 0)
                goto out_err;
 
-       err = svc_create_xprt(nfsd_serv, transport, &init_net,
+       err = svc_create_xprt(nfsd_serv, transport, net,
                                PF_INET6, port, SVC_SOCK_ANONYMOUS);
        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_close;
@@ -734,12 +738,14 @@ static ssize_t __write_ports_addxprt(char *buf)
        nfsd_serv->sv_nrthreads--;
        return 0;
 out_close:
-       xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
+       xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
        if (xprt != NULL) {
                svc_close_xprt(xprt);
                svc_xprt_put(xprt);
        }
 out_err:
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
        svc_destroy(nfsd_serv);
        return err;
 }
index cb4d51d8cbdb3818cae8a4404d3e65b0aca84799..ee709fc8f58bc0b62a3f7ca64104630fe803b6d0 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/fs_struct.h>
 #include <linux/swap.h>
+#include <linux/nsproxy.h>
 
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svcsock.h>
@@ -330,6 +331,8 @@ static int nfsd_get_default_max_blksize(void)
 
 int nfsd_create_serv(void)
 {
+       int error;
+
        WARN_ON(!mutex_is_locked(&nfsd_mutex));
        if (nfsd_serv) {
                svc_get(nfsd_serv);
@@ -343,6 +346,12 @@ int nfsd_create_serv(void)
        if (nfsd_serv == NULL)
                return -ENOMEM;
 
+       error = svc_bind(nfsd_serv, current->nsproxy->net_ns);
+       if (error < 0) {
+               svc_destroy(nfsd_serv);
+               return error;
+       }
+
        set_max_drc();
        do_gettimeofday(&nfssvc_boot);          /* record boot time */
        return 0;
@@ -373,6 +382,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
        int i = 0;
        int tot = 0;
        int err = 0;
+       struct net *net = &init_net;
 
        WARN_ON(!mutex_is_locked(&nfsd_mutex));
 
@@ -417,6 +427,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
                if (err)
                        break;
        }
+
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
        svc_destroy(nfsd_serv);
 
        return err;
@@ -432,6 +445,7 @@ nfsd_svc(unsigned short port, int nrservs)
 {
        int     error;
        bool    nfsd_up_before;
+       struct net *net = &init_net;
 
        mutex_lock(&nfsd_mutex);
        dprintk("nfsd: creating service\n");
@@ -464,6 +478,8 @@ out_shutdown:
        if (error < 0 && !nfsd_up_before)
                nfsd_shutdown();
 out_destroy:
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
        svc_destroy(nfsd_serv);         /* Release server */
 out:
        mutex_unlock(&nfsd_mutex);
@@ -547,6 +563,9 @@ nfsd(void *vrqstp)
        nfsdstats.th_cnt --;
 
 out:
+       if (rqstp->rq_server->sv_nrthreads == 1)
+               svc_shutdown_net(rqstp->rq_server, &init_net);
+
        /* Release the thread */
        svc_exit_thread(rqstp);
 
@@ -659,8 +678,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
 int nfsd_pool_stats_release(struct inode *inode, struct file *file)
 {
        int ret = seq_release(inode, file);
+       struct net *net = &init_net;
+
        mutex_lock(&nfsd_mutex);
        /* this function really, really should have been called svc_put() */
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
        svc_destroy(nfsd_serv);
        mutex_unlock(&nfsd_mutex);
        return ret;
index 89ab137d379a3f6756b5b5616083862e8f22d88f..849091e16ea6afd43e4ddd2dbd17962fdd87ad85 100644 (file)
@@ -232,7 +232,6 @@ struct nfs4_client {
        time_t                  cl_time;        /* time of last lease renewal */
        struct sockaddr_storage cl_addr;        /* client ipaddress */
        u32                     cl_flavor;      /* setclientid pseudoflavor */
-       char                    *cl_principal;  /* setclientid principal name */
        struct svc_cred         cl_cred;        /* setclientid principal */
        clientid_t              cl_clientid;    /* generated by server */
        nfs4_verifier           cl_confirm;     /* generated by server */
index 1b3501598ab5dbb4609ba19e4f7c3322b29f70ba..acd127d4ee821660e71fe1e38ef1c804962f6508 100644 (file)
@@ -60,7 +60,7 @@ struct nfsd4_compound_state {
        __be32                  *datap;
        size_t                  iovlen;
        u32                     minorversion;
-       u32                     status;
+       __be32                  status;
        stateid_t       current_stateid;
        stateid_t       save_stateid;
        /* to indicate current and saved state id presents */
@@ -364,7 +364,7 @@ struct nfsd4_test_stateid_id {
 };
 
 struct nfsd4_test_stateid {
-       __be32          ts_num_ids;
+       u32             ts_num_ids;
        struct list_head ts_stateid_list;
 };
 
@@ -549,7 +549,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
                struct nfsd4_compoundargs *);
 int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
                struct nfsd4_compoundres *);
-int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
 void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
 void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
 __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
index 0bb2c2010b9512ba5fd971fdbdc34eba2abab886..b72847988b78d96d99b7571d17fea769e463c6b0 100644 (file)
@@ -508,31 +508,29 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
        return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
 }
 
-static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
-                          int connectable)
+static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+                          struct inode *parent)
 {
        struct nilfs_fid *fid = (struct nilfs_fid *)fh;
-       struct inode *inode = dentry->d_inode;
        struct nilfs_root *root = NILFS_I(inode)->i_root;
        int type;
 
-       if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
-           (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+       if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
+               *lenp = NILFS_FID_SIZE_CONNECTABLE;
+               return 255;
+       }
+       if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
+               *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
                return 255;
+       }
 
        fid->cno = root->cno;
        fid->ino = inode->i_ino;
        fid->gen = inode->i_generation;
 
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                fid->parent_ino = parent->i_ino;
                fid->parent_gen = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
-
                type = FILEID_NILFS_WITH_PARENT;
                *lenp = NILFS_FID_SIZE_CONNECTABLE;
        } else {
index ccb14d3fc0de99790d282ae17ce984338ca309ab..b39c5c161adb64bff0d33faa64f41d8f4a9942cd 100644 (file)
@@ -123,7 +123,7 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_parent);
 
-static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
+static int send_to_group(struct inode *to_tell,
                         struct fsnotify_mark *inode_mark,
                         struct fsnotify_mark *vfsmount_mark,
                         __u32 mask, void *data,
@@ -168,10 +168,10 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
                        vfsmount_test_mask &= ~inode_mark->ignored_mask;
        }
 
-       pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+       pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
                 " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
                 " data=%p data_is=%d cookie=%d event=%p\n",
-                __func__, group, to_tell, mnt, mask, inode_mark,
+                __func__, group, to_tell, mask, inode_mark,
                 inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
                 data_is, cookie, *event);
 
@@ -258,16 +258,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
 
                if (inode_group > vfsmount_group) {
                        /* handle inode */
-                       ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+                       ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
                                            data_is, cookie, file_name, &event);
                        /* we didn't use the vfsmount_mark */
                        vfsmount_group = NULL;
                } else if (vfsmount_group > inode_group) {
-                       ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data,
+                       ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
                                            data_is, cookie, file_name, &event);
                        inode_group = NULL;
                } else {
-                       ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark,
+                       ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
                                            mask, data, data_is, cookie, file_name,
                                            &event);
                }
index 8639169221c7aed21c0bd600ab4ef1a0d8102cb1..7389d2d5e51d257c72f9fb0c1468c38a28b309e4 100644 (file)
@@ -2096,7 +2096,9 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
        err = file_remove_suid(file);
        if (err)
                goto out;
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
        written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
                        count);
 out:
index c7ee03c22226253d970cce94beb11f6353b3e1d0..0725e605465040b6b1e7c5e7744c5243968158c9 100644 (file)
@@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
                               struct ocfs2_blockcheck_stats *stats)
 {
        int rc = 0;
-       struct ocfs2_block_check check;
+       u32 bc_crc32e;
+       u16 bc_ecc;
        u32 crc, ecc;
 
        ocfs2_blockcheck_inc_check(stats);
 
-       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
-       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+       bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       bc_ecc = le16_to_cpu(bc->bc_ecc);
 
        memset(bc, 0, sizeof(struct ocfs2_block_check));
 
        /* Fast path - if the crc32 validates, we're good to go */
        crc = crc32_le(~0, data, blocksize);
-       if (crc == check.bc_crc32e)
+       if (crc == bc_crc32e)
                goto out;
 
        ocfs2_blockcheck_inc_failure(stats);
        mlog(ML_ERROR,
             "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
 
        /* Ok, try ECC fixups */
        ecc = ocfs2_hamming_encode_block(data, blocksize);
-       ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+       ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
 
        /* And check the crc32 again */
        crc = crc32_le(~0, data, blocksize);
-       if (crc == check.bc_crc32e) {
+       if (crc == bc_crc32e) {
                ocfs2_blockcheck_inc_recover(stats);
                goto out;
        }
 
        mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
 
        rc = -EIO;
 
 out:
-       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
-       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+       bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(bc_ecc);
 
        return rc;
 }
@@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
                                   struct ocfs2_blockcheck_stats *stats)
 {
        int i, rc = 0;
-       struct ocfs2_block_check check;
+       u32 bc_crc32e;
+       u16 bc_ecc;
        u32 crc, ecc, fix;
 
        BUG_ON(nr < 0);
@@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
 
        ocfs2_blockcheck_inc_check(stats);
 
-       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
-       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+       bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       bc_ecc = le16_to_cpu(bc->bc_ecc);
 
        memset(bc, 0, sizeof(struct ocfs2_block_check));
 
        /* Fast path - if the crc32 validates, we're good to go */
        for (i = 0, crc = ~0; i < nr; i++)
                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-       if (crc == check.bc_crc32e)
+       if (crc == bc_crc32e)
                goto out;
 
        ocfs2_blockcheck_inc_failure(stats);
        mlog(ML_ERROR,
             "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
 
        /* Ok, try ECC fixups */
        for (i = 0, ecc = 0; i < nr; i++) {
@@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
                                                bhs[i]->b_size * 8,
                                                bhs[i]->b_size * 8 * i);
        }
-       fix = ecc ^ check.bc_ecc;
+       fix = ecc ^ bc_ecc;
        for (i = 0; i < nr; i++) {
                /*
                 * Try the fix against each buffer.  It will only affect
@@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
        /* And check the crc32 again */
        for (i = 0, crc = ~0; i < nr; i++)
                crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-       if (crc == check.bc_crc32e) {
+       if (crc == bc_crc32e) {
                ocfs2_blockcheck_inc_recover(stats);
                goto out;
        }
 
        mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
 
        rc = -EIO;
 
 out:
-       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
-       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+       bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(bc_ecc);
 
        return rc;
 }
index 3a3ed4bb794b0d6c75e7e321b042b1b4128fbd27..fbec0be6232622ddda0c3ed4ed49c50cc0129386 100644 (file)
@@ -293,7 +293,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
        struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
        char *name;
        struct list_head *iter, *head=NULL;
-       u64 cookie;
+       __be64 cookie;
        u32 flags;
        u8 node;
 
index a5952ceecba5a83147389ad4a1cd24972ee0bfbe..de854cca12a2d23dea5652d3dad38461c7dbde13 100644 (file)
@@ -679,7 +679,7 @@ struct dlm_query_join_packet {
 };
 
 union dlm_query_join_response {
-       u32 intval;
+       __be32 intval;
        struct dlm_query_join_packet packet;
 };
 
@@ -755,8 +755,8 @@ struct dlm_query_region {
 struct dlm_node_info {
        u8 ni_nodenum;
        u8 pad1;
-       u16 ni_ipv4_port;
-       u32 ni_ipv4_address;
+       __be16 ni_ipv4_port;
+       __be32 ni_ipv4_address;
 };
 
 struct dlm_query_nodeinfo {
index 92f2ead0fab6de22fa138cc4410dee6e1544216c..9e89d70df337fc98836e87f90e38887a716843e6 100644 (file)
@@ -818,7 +818,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
        union dlm_query_join_response response;
 
        response.packet = *packet;
-       *wire = cpu_to_be32(response.intval);
+       *wire = be32_to_cpu(response.intval);
 }
 
 static void dlm_query_join_wire_to_packet(u32 wire,
index 745db42528d5fd2f875a099177158f347fd361e7..322216a5f0dd1e0f2e178540781b3c6fd263c985 100644 (file)
@@ -177,21 +177,23 @@ bail:
        return parent;
 }
 
-static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
-                          int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+                          struct inode *parent)
 {
-       struct inode *inode = dentry->d_inode;
        int len = *max_len;
        int type = 1;
        u64 blkno;
        u32 generation;
        __le32 *fh = (__force __le32 *) fh_in;
 
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then.  Somehow"
        trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
                                    dentry->d_name.name,
                                    fh, len, connectable);
+#endif
 
-       if (connectable && (len < 6)) {
+       if (parent && (len < 6)) {
                *max_len = 6;
                type = 255;
                goto bail;
@@ -211,12 +213,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
        fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
        fh[2] = cpu_to_le32(generation);
 
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                blkno = OCFS2_I(parent)->ip_blkno;
                generation = parent->i_generation;
 
@@ -224,8 +221,6 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
                fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
                fh[5] = cpu_to_le32(generation);
 
-               spin_unlock(&dentry->d_lock);
-
                len = 6;
                type = 2;
 
index 735514ca400f7942268dff8f387c7c029506b859..d89e08a81eda8875fcd59d76c5e1d75827e7d7ac 100644 (file)
@@ -273,11 +273,13 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
        inode->i_gid = le32_to_cpu(fe->i_gid);
 
        /* Fast symlinks will have i_size but no allocated clusters. */
-       if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+       if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
                inode->i_blocks = 0;
-       else
+               inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+       } else {
                inode->i_blocks = ocfs2_inode_sector_count(inode);
-       inode->i_mapping->a_ops = &ocfs2_aops;
+               inode->i_mapping->a_ops = &ocfs2_aops;
+       }
        inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
        inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
        inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -331,10 +333,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                    OCFS2_I(inode)->ip_dir_lock_gen = 1;
                    break;
            case S_IFLNK:
-                   if (ocfs2_inode_is_fast_symlink(inode))
-                       inode->i_op = &ocfs2_fast_symlink_inode_operations;
-                   else
-                       inode->i_op = &ocfs2_symlink_inode_operations;
+                   inode->i_op = &ocfs2_symlink_inode_operations;
                    i_size_write(inode, le64_to_cpu(fe->i_size));
                    break;
            default:
index a1a1bfd652c90d49521ad3ea12a908f9a168c1e9..d96f7f81d8dd3257f49bb02885db296af22881cb 100644 (file)
@@ -864,7 +864,7 @@ int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
                if (status)
                        break;
 
-               reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+               reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
                if (!reqp) {
                        status = -EINVAL;
                        goto bail;
@@ -888,9 +888,11 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
        struct ocfs2_space_resv sr;
        struct ocfs2_new_group_input input;
        struct reflink_arguments args;
-       const char *old_path, *new_path;
+       const char __user *old_path;
+       const char __user *new_path;
        bool preserve;
        struct ocfs2_info info;
+       void __user *argp = (void __user *)arg;
 
        switch (cmd) {
        case OCFS2_IOC_GETFLAGS:
@@ -937,17 +939,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
                return ocfs2_group_add(inode, &input);
        case OCFS2_IOC_REFLINK:
-               if (copy_from_user(&args, (struct reflink_arguments *)arg,
-                                  sizeof(args)))
+               if (copy_from_user(&args, argp, sizeof(args)))
                        return -EFAULT;
-               old_path = (const char *)(unsigned long)args.old_path;
-               new_path = (const char *)(unsigned long)args.new_path;
+               old_path = (const char __user *)(unsigned long)args.old_path;
+               new_path = (const char __user *)(unsigned long)args.new_path;
                preserve = (args.preserve != 0);
 
                return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
        case OCFS2_IOC_INFO:
-               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
-                                  sizeof(struct ocfs2_info)))
+               if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
                        return -EFAULT;
 
                return ocfs2_info_handle(inode, &info, 0);
@@ -960,22 +960,20 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
 
-               if (copy_from_user(&range, (struct fstrim_range *)arg,
-                   sizeof(range)))
+               if (copy_from_user(&range, argp, sizeof(range)))
                        return -EFAULT;
 
                ret = ocfs2_trim_fs(sb, &range);
                if (ret < 0)
                        return ret;
 
-               if (copy_to_user((struct fstrim_range *)arg, &range,
-                   sizeof(range)))
+               if (copy_to_user(argp, &range, sizeof(range)))
                        return -EFAULT;
 
                return 0;
        }
        case OCFS2_IOC_MOVE_EXT:
-               return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
+               return ocfs2_ioctl_move_extents(filp, argp);
        default:
                return -ENOTTY;
        }
@@ -988,6 +986,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        struct reflink_arguments args;
        struct inode *inode = file->f_path.dentry->d_inode;
        struct ocfs2_info info;
+       void __user *argp = (void __user *)arg;
 
        switch (cmd) {
        case OCFS2_IOC32_GETFLAGS:
@@ -1006,16 +1005,14 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
        case FITRIM:
                break;
        case OCFS2_IOC_REFLINK:
-               if (copy_from_user(&args, (struct reflink_arguments *)arg,
-                                  sizeof(args)))
+               if (copy_from_user(&args, argp, sizeof(args)))
                        return -EFAULT;
                preserve = (args.preserve != 0);
 
                return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                                           compat_ptr(args.new_path), preserve);
        case OCFS2_IOC_INFO:
-               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
-                                  sizeof(struct ocfs2_info)))
+               if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
                        return -EFAULT;
 
                return ocfs2_info_handle(inode, &info, 1);
index b1e3fce72ea4767bf795c692e98faacebc42797c..6083432f667e3077eb466842ef0f00136d0b4b6f 100644 (file)
@@ -1082,8 +1082,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
        context->file = filp;
 
        if (argp) {
-               if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
-                                  sizeof(range))) {
+               if (copy_from_user(&range, argp, sizeof(range))) {
                        status = -EFAULT;
                        goto out;
                }
@@ -1138,8 +1137,7 @@ out:
         * length and new_offset even if failure happens somewhere.
         */
        if (argp) {
-               if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
-                               sizeof(range)))
+               if (copy_to_user(argp, &range, sizeof(range)))
                        status = -EFAULT;
        }
 
index a9856e3eaaf09753b4921d56ccdfb172db5cad7b..9f39c640cddf2076b951295dde5ef68217b26452 100644 (file)
@@ -1724,15 +1724,16 @@ static int ocfs2_symlink(struct inode *dir,
        fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
        inode->i_rdev = 0;
        newsize = l - 1;
+       inode->i_op = &ocfs2_symlink_inode_operations;
        if (l > ocfs2_fast_symlink_chars(sb)) {
                u32 offset = 0;
 
-               inode->i_op = &ocfs2_symlink_inode_operations;
                status = dquot_alloc_space_nodirty(inode,
                    ocfs2_clusters_to_bytes(osb->sb, 1));
                if (status)
                        goto bail;
                did_quota = 1;
+               inode->i_mapping->a_ops = &ocfs2_aops;
                status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
                                              new_fe_bh,
                                              handle, data_ac, NULL,
@@ -1750,7 +1751,7 @@ static int ocfs2_symlink(struct inode *dir,
                i_size_write(inode, newsize);
                inode->i_blocks = ocfs2_inode_sector_count(inode);
        } else {
-               inode->i_op = &ocfs2_fast_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
                memcpy((char *) fe->id2.i_symlink, symname, l);
                i_size_write(inode, newsize);
                inode->i_blocks = 0;
index 5d22872e2bb36012b711ac7720a90bee24717b15..f1fbb4b552ad3649238becdd9c21d4b138d5c8d7 100644 (file)
 #include "buffer_head_io.h"
 
 
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
-                                       struct buffer_head **bh)
+/*
+ * ->readpage() for fast symlinks: copy the link target stored inline in the
+ * on-disk inode into @page.  The page is unlocked on every path out of here,
+ * success or failure.
+ */
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
 {
-       int status;
-       char *link = NULL;
+       struct inode *inode = page->mapping->host;
+       struct buffer_head *bh;
+       int status = ocfs2_read_inode_block(inode, &bh);
        struct ocfs2_dinode *fe;
+       const char *link;
+       void *kaddr;
+       size_t len;
 
-       status = ocfs2_read_inode_block(inode, bh);
        if (status < 0) {
                mlog_errno(status);
-               link = ERR_PTR(status);
-               goto bail;
+               /* nobody else will unlock the page for us on failure */
+               unlock_page(page);
+               return status;
        }
 
-       fe = (struct ocfs2_dinode *) (*bh)->b_data;
+       fe = (struct ocfs2_dinode *) bh->b_data;
        link = (char *) fe->id2.i_symlink;
-bail:
-
-       return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
-                         char __user *buffer,
-                         int buflen)
-{
-       int ret;
-       char *link;
-       struct buffer_head *bh = NULL;
-       struct inode *inode = dentry->d_inode;
-
-       link = ocfs2_fast_symlink_getlink(inode, &bh);
-       if (IS_ERR(link)) {
-               ret = PTR_ERR(link);
-               goto out;
-       }
-
-       /*
-        * Without vfsmount we can't update atime now,
-        * but we will update atime here ultimately.
-        */
-       ret = vfs_readlink(dentry, buffer, buflen, link);
-
+       /* will be less than a page size */
+       len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+       kaddr = kmap_atomic(page);
+       memcpy(kaddr, link, len + 1);
+       kunmap_atomic(kaddr);
+       SetPageUptodate(page);
+       unlock_page(page);
        brelse(bh);
-out:
-       if (ret < 0)
-               mlog_errno(ret);
-       return ret;
+       return 0;
 }
 
-static void *ocfs2_fast_follow_link(struct dentry *dentry,
-                                   struct nameidata *nd)
-{
-       int status = 0;
-       int len;
-       char *target, *link = ERR_PTR(-ENOMEM);
-       struct inode *inode = dentry->d_inode;
-       struct buffer_head *bh = NULL;
-
-       BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
-       target = ocfs2_fast_symlink_getlink(inode, &bh);
-       if (IS_ERR(target)) {
-               status = PTR_ERR(target);
-               mlog_errno(status);
-               goto bail;
-       }
-
-       /* Fast symlinks can't be large */
-       len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
-       link = kzalloc(len + 1, GFP_NOFS);
-       if (!link) {
-               status = -ENOMEM;
-               mlog_errno(status);
-               goto bail;
-       }
-
-       memcpy(link, target, len);
-
-bail:
-       nd_set_link(nd, status ? ERR_PTR(status) : link);
-       brelse(bh);
-
-       if (status)
-               mlog_errno(status);
-       return NULL;
-}
-
-static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
-       char *link = nd_get_link(nd);
-       if (!IS_ERR(link))
-               kfree(link);
-}
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+       .readpage               = ocfs2_fast_symlink_readpage,
+};
 
 const struct inode_operations ocfs2_symlink_inode_operations = {
-       .readlink       = page_readlink,
+       .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
        .getattr        = ocfs2_getattr,
@@ -159,15 +105,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
        .removexattr    = generic_removexattr,
        .fiemap         = ocfs2_fiemap,
 };
-const struct inode_operations ocfs2_fast_symlink_inode_operations = {
-       .readlink       = ocfs2_readlink,
-       .follow_link    = ocfs2_fast_follow_link,
-       .put_link       = ocfs2_fast_put_link,
-       .getattr        = ocfs2_getattr,
-       .setattr        = ocfs2_setattr,
-       .setxattr       = generic_setxattr,
-       .getxattr       = generic_getxattr,
-       .listxattr      = ocfs2_listxattr,
-       .removexattr    = generic_removexattr,
-       .fiemap         = ocfs2_fiemap,
-};
index 65a6c9c6ad51d1018147cff4685743dd22bae935..71ee4245e9192274552ef9492412b36b068e72d6 100644 (file)
@@ -27,7 +27,7 @@
 #define OCFS2_SYMLINK_H
 
 extern const struct inode_operations ocfs2_symlink_inode_operations;
-extern const struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
 
 /*
  * Test whether an inode is a fast symlink.
index d54301219d04f1c8fed18d6de15ed590a593e2ab..d6c79a0dffc7b0827b09562e11fa0f610af5657d 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -654,10 +654,23 @@ static inline int __get_file_write_access(struct inode *inode,
        return error;
 }
 
-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
-                                       struct file *f,
-                                       int (*open)(struct inode *, struct file *),
-                                       const struct cred *cred)
+int open_check_o_direct(struct file *f)
+{
+       /* NB: we're sure to have correct a_ops only after f_op->open */
+       if (f->f_flags & O_DIRECT) {
+               if (!f->f_mapping->a_ops ||
+                   ((!f->f_mapping->a_ops->direct_IO) &&
+                   (!f->f_mapping->a_ops->get_xip_mem))) {
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+                                  struct file *f,
+                                  int (*open)(struct inode *, struct file *),
+                                  const struct cred *cred)
 {
        static const struct file_operations empty_fops = {};
        struct inode *inode;
@@ -713,16 +726,6 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
 
        file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
 
-       /* NB: we're sure to have correct a_ops only after f_op->open */
-       if (f->f_flags & O_DIRECT) {
-               if (!f->f_mapping->a_ops ||
-                   ((!f->f_mapping->a_ops->direct_IO) &&
-                   (!f->f_mapping->a_ops->get_xip_mem))) {
-                       fput(f);
-                       f = ERR_PTR(-EINVAL);
-               }
-       }
-
        return f;
 
 cleanup_all:
@@ -744,12 +747,29 @@ cleanup_all:
        f->f_path.dentry = NULL;
        f->f_path.mnt = NULL;
 cleanup_file:
-       put_filp(f);
        dput(dentry);
        mntput(mnt);
        return ERR_PTR(error);
 }
 
+static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+                               struct file *f,
+                               int (*open)(struct inode *, struct file *),
+                               const struct cred *cred)
+{
+       struct file *res = do_dentry_open(dentry, mnt, f, open, cred);
+       if (!IS_ERR(res)) {
+               int error = open_check_o_direct(f);
+               if (error) {
+                       fput(res);
+                       res = ERR_PTR(error);
+               }
+       } else {
+               put_filp(f);
+       }
+       return res;
+}
+
 /**
  * lookup_instantiate_filp - instantiates the open intent filp
  * @nd: pointer to nameidata
@@ -804,13 +824,31 @@ struct file *nameidata_to_filp(struct nameidata *nd)
 
        /* Pick up the filp from the open intent */
        filp = nd->intent.open.file;
-       nd->intent.open.file = NULL;
 
        /* Has the filesystem initialised the file for us? */
-       if (filp->f_path.dentry == NULL) {
+       if (filp->f_path.dentry != NULL) {
+               nd->intent.open.file = NULL;
+       } else {
+               struct file *res;
+
                path_get(&nd->path);
-               filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
-                                    NULL, cred);
+               res = do_dentry_open(nd->path.dentry, nd->path.mnt,
+                                    filp, NULL, cred);
+               if (!IS_ERR(res)) {
+                       int error;
+
+                       nd->intent.open.file = NULL;
+                       BUG_ON(res != filp);
+
+                       error = open_check_o_direct(filp);
+                       if (error) {
+                               fput(filp);
+                               filp = ERR_PTR(error);
+                       }
+               } else {
+                       /* Allow nd->intent.open.file to be recycled */
+                       filp = res;
+               }
        }
        return filp;
 }
index 95ebb56de494de44efb6224ccfee1b96e4169267..49c1065256fd10d9d5fdca3cf449b1e56bd58a0a 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -654,8 +654,11 @@ out:
                wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
-       if (ret > 0)
-               file_update_time(filp);
+       if (ret > 0) {
+               int err = file_update_time(filp);
+               if (err)
+                       ret = err;
+       }
        return ret;
 }
 
index ab5fa9e1a79ac8277ac1cb51db6a92e55ba2c935..bed378db075813350362c39f423d1b4335240bfa 100644 (file)
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
                prev_src_mnt  = child;
        }
 out:
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
        while (!list_empty(&tmp_list)) {
                child = list_first_entry(&tmp_list, struct mount, mnt_hash);
                umount_tree(child, 0, &umount_list);
        }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
        release_mounts(&umount_list);
        return ret;
 }
index 12412852d88a94d574bacebb5e64200f202db852..5e289a7cbad17d8547458f1d8b2526f2e85e5cb9 100644 (file)
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
 
        poll_wait(file, &p->ns->poll, wait);
 
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
        if (p->m.poll_event != ns->event) {
                p->m.poll_event = ns->event;
                res |= POLLERR | POLLPRI;
        }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
 
        return res;
 }
index cc0a8227cddf688f70e289c427666057ce98e613..39e3370d79cf1e6399843137e2d64165baf49a03 100644 (file)
@@ -108,11 +108,11 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
        int error;
        struct file * file;
        struct readdir_callback buf;
+       int fput_needed;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.result = 0;
        buf.dirent = dirent;
@@ -121,8 +121,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
        if (buf.result)
                error = buf.result;
 
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
 
@@ -195,16 +194,15 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
        struct file * file;
        struct linux_dirent __user * lastdirent;
        struct getdents_callback buf;
+       int fput_needed;
        int error;
 
-       error = -EFAULT;
        if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.current_dir = dirent;
        buf.previous = NULL;
@@ -221,8 +219,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
                else
                        error = count - buf.count;
        }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
 
@@ -278,16 +275,15 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
        struct file * file;
        struct linux_dirent64 __user * lastdirent;
        struct getdents_callback64 buf;
+       int fput_needed;
        int error;
 
-       error = -EFAULT;
        if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
 
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (!file)
-               goto out;
+               return -EBADF;
 
        buf.current_dir = dirent;
        buf.previous = NULL;
@@ -305,7 +301,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
                else
                        error = count - buf.count;
        }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
        return error;
 }
index 59d06871a850dcebc966581f43c656bedba440f0..a6d4268fb6c11798db5f8339bd14ab297cd9b21f 100644 (file)
@@ -1592,13 +1592,12 @@ struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
                (fh_type == 6) ? fid->raw[5] : 0);
 }
 
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
-                      int need_parent)
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+                      struct inode *parent)
 {
-       struct inode *inode = dentry->d_inode;
        int maxlen = *lenp;
 
-       if (need_parent && (maxlen < 5)) {
+       if (parent && (maxlen < 5)) {
                *lenp = 5;
                return 255;
        } else if (maxlen < 3) {
@@ -1610,20 +1609,15 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
        data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
        data[2] = inode->i_generation;
        *lenp = 3;
-       /* no room for directory info? return what we've stored so far */
-       if (maxlen < 5 || !need_parent)
-               return 3;
-
-       spin_lock(&dentry->d_lock);
-       inode = dentry->d_parent->d_inode;
-       data[3] = inode->i_ino;
-       data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-       *lenp = 5;
-       if (maxlen >= 6) {
-               data[5] = inode->i_generation;
-               *lenp = 6;
-       }
-       spin_unlock(&dentry->d_lock);
+       if (parent) {
+               data[3] = parent->i_ino;
+               data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+               *lenp = 5;
+               if (maxlen >= 6) {
+                       data[5] = parent->i_generation;
+                       *lenp = 6;
+               }
+       }
        return *lenp;
 }
 
index b1a08573fe14277961aa3039ce0fb587d4f889cc..afcadcc03e8ac87c7f25f3e2393b3c108daaf91d 100644 (file)
@@ -1923,6 +1923,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
         * the workqueue job (flush_async_commit) needs this lock
         */
        reiserfs_write_unlock(sb);
+
+       cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
        flush_workqueue(commit_wq);
 
        if (!reiserfs_mounted_fs_count) {
@@ -3231,8 +3233,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
                               th->t_trans_id, journal->j_trans_id);
        }
 
-       sb->s_dirt = 1;
-
        prepared = test_clear_buffer_journal_prepared(bh);
        clear_buffer_journal_restore_dirty(bh);
        /* already in this transaction, we are done */
@@ -3316,6 +3316,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
                journal->j_first = cn;
                journal->j_last = cn;
        }
+       reiserfs_schedule_old_flush(sb);
        return 0;
 }
 
@@ -3492,7 +3493,7 @@ static void flush_async_commits(struct work_struct *work)
 ** flushes any old transactions to disk
 ** ends the current transaction if it is too old
 */
-int reiserfs_flush_old_commits(struct super_block *sb)
+void reiserfs_flush_old_commits(struct super_block *sb)
 {
        time_t now;
        struct reiserfs_transaction_handle th;
@@ -3502,9 +3503,8 @@ int reiserfs_flush_old_commits(struct super_block *sb)
        /* safety check so we don't flush while we are replaying the log during
         * mount
         */
-       if (list_empty(&journal->j_journal_list)) {
-               return 0;
-       }
+       if (list_empty(&journal->j_journal_list))
+               return;
 
        /* check the current transaction.  If there are no writers, and it is
         * too old, finish it, and force the commit blocks to disk
@@ -3526,7 +3526,6 @@ int reiserfs_flush_old_commits(struct super_block *sb)
                        do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
                }
        }
-       return sb->s_dirt;
 }
 
 /*
@@ -3955,7 +3954,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
         ** it tells us if we should continue with the journal_end, or just return
         */
        if (!check_journal_end(th, sb, nblocks, flags)) {
-               sb->s_dirt = 1;
+               reiserfs_schedule_old_flush(sb);
                wake_queued_writers(sb);
                reiserfs_async_progress_wait(sb);
                goto out;
index a59d27126338e43939f8fc04942acf13c956f1e4..33215f57ea06ce3026ef2d488832a337a361bd71 100644 (file)
@@ -480,6 +480,11 @@ struct reiserfs_sb_info {
        struct dentry *priv_root;       /* root of /.reiserfs_priv */
        struct dentry *xattr_root;      /* root of /.reiserfs_priv/xattrs */
        int j_errno;
+
+       int work_queued;              /* non-zero delayed work is queued */
+       struct delayed_work old_work; /* old transactions flush delayed work */
+       spinlock_t old_work_lock;     /* protects old_work and work_queued */
+
 #ifdef CONFIG_QUOTA
        char *s_qf_names[MAXQUOTAS];
        int s_jquota_fmt;
@@ -2452,7 +2457,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
 int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
 int reiserfs_commit_page(struct inode *inode, struct page *page,
                         unsigned from, unsigned to);
-int reiserfs_flush_old_commits(struct super_block *);
+void reiserfs_flush_old_commits(struct super_block *);
 int reiserfs_commit_for_inode(struct inode *);
 int reiserfs_inode_needs_commit(struct inode *);
 void reiserfs_update_inode_transaction(struct inode *);
@@ -2487,6 +2492,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
 int reiserfs_allocate_list_bitmaps(struct super_block *s,
                                   struct reiserfs_list_bitmap *, unsigned int);
 
+void reiserfs_schedule_old_flush(struct super_block *s);
 void add_save_link(struct reiserfs_transaction_handle *th,
                   struct inode *inode, int truncate);
 int remove_save_link(struct inode *inode, int truncate);
@@ -2611,8 +2617,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                     int fh_len, int fh_type);
 struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
                                     int fh_len, int fh_type);
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
-                      int connectable);
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+                      struct inode *parent);
 
 int reiserfs_truncate_file(struct inode *, int update_timestamps);
 void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
index 9a17f63c3fd7f3618a44bdf946476e4957dcd50d..3ce02cff5e90bd1c26374e12e15a6f56ea8c8803 100644 (file)
@@ -200,7 +200,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
                                          (bmap_nr_new - bmap_nr)));
        PUT_SB_BLOCK_COUNT(s, block_count_new);
        PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
-       s->s_dirt = 1;
 
        journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
 
index c07b7d709447de1670e9caf8a7fe9bbca6593138..651ce767b55d8241e283b3001d7fc9e6d803b317 100644 (file)
@@ -72,20 +72,58 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
        if (!journal_begin(&th, s, 1))
                if (!journal_end_sync(&th, s, 1))
                        reiserfs_flush_old_commits(s);
-       s->s_dirt = 0;  /* Even if it's not true.
-                        * We'll loop forever in sync_supers otherwise */
        reiserfs_write_unlock(s);
        return 0;
 }
 
-static void reiserfs_write_super(struct super_block *s)
+static void flush_old_commits(struct work_struct *work)
 {
+       struct reiserfs_sb_info *sbi;
+       struct super_block *s;
+
+       sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
+       s = sbi->s_journal->j_work_sb;
+
+       spin_lock(&sbi->old_work_lock);
+       sbi->work_queued = 0;
+       spin_unlock(&sbi->old_work_lock);
+
        reiserfs_sync_fs(s, 1);
 }
 
+void reiserfs_schedule_old_flush(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+       unsigned long delay;
+
+       if (s->s_flags & MS_RDONLY)
+               return;
+
+       spin_lock(&sbi->old_work_lock);
+       if (!sbi->work_queued) {
+               delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+               queue_delayed_work(system_long_wq, &sbi->old_work, delay);
+               sbi->work_queued = 1;
+       }
+       spin_unlock(&sbi->old_work_lock);
+}
+
+static void cancel_old_flush(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+       cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+       spin_lock(&sbi->old_work_lock);
+       sbi->work_queued = 0;
+       spin_unlock(&sbi->old_work_lock);
+}
+
 static int reiserfs_freeze(struct super_block *s)
 {
        struct reiserfs_transaction_handle th;
+
+       cancel_old_flush(s);
+
        reiserfs_write_lock(s);
        if (!(s->s_flags & MS_RDONLY)) {
                int err = journal_begin(&th, s, 1);
@@ -99,7 +137,6 @@ static int reiserfs_freeze(struct super_block *s)
                        journal_end_sync(&th, s, 1);
                }
        }
-       s->s_dirt = 0;
        reiserfs_write_unlock(s);
        return 0;
 }
@@ -483,9 +520,6 @@ static void reiserfs_put_super(struct super_block *s)
 
        reiserfs_write_lock(s);
 
-       if (s->s_dirt)
-               reiserfs_write_super(s);
-
        /* change file system state to current state if it was mounted with read-write permissions */
        if (!(s->s_flags & MS_RDONLY)) {
                if (!journal_begin(&th, s, 10)) {
@@ -692,7 +726,6 @@ static const struct super_operations reiserfs_sops = {
        .dirty_inode = reiserfs_dirty_inode,
        .evict_inode = reiserfs_evict_inode,
        .put_super = reiserfs_put_super,
-       .write_super = reiserfs_write_super,
        .sync_fs = reiserfs_sync_fs,
        .freeze_fs = reiserfs_freeze,
        .unfreeze_fs = reiserfs_unfreeze,
@@ -1400,7 +1433,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        err = journal_end(&th, s, 10);
        if (err)
                goto out_err;
-       s->s_dirt = 0;
 
        if (!(*mount_flags & MS_RDONLY)) {
                dquot_resume(s, -1);
@@ -1730,19 +1762,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                return -ENOMEM;
        s->s_fs_info = sbi;
        /* Set default values for options: non-aggressive tails, RO on errors */
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
+       sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
+       sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+       sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
        /* no preallocation minimum, be smart in
           reiserfs_file_write instead */
-       REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
+       sbi->s_alloc_options.preallocmin = 0;
        /* Preallocate by 16 blocks (17-1) at once */
-       REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
+       sbi->s_alloc_options.preallocsize = 17;
        /* setup default block allocator options */
        reiserfs_init_alloc_options(s);
 
-       mutex_init(&REISERFS_SB(s)->lock);
-       REISERFS_SB(s)->lock_depth = -1;
+       spin_lock_init(&sbi->old_work_lock);
+       INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
+       mutex_init(&sbi->lock);
+       sbi->lock_depth = -1;
 
        jdev_name = NULL;
        if (reiserfs_parse_options
@@ -1751,8 +1785,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                goto error_unlocked;
        }
        if (jdev_name && jdev_name[0]) {
-               REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
-               if (!REISERFS_SB(s)->s_jdev) {
+               sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
+               if (!sbi->s_jdev) {
                        SWARN(silent, s, "", "Cannot allocate memory for "
                                "journal device name");
                        goto error;
@@ -1810,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
        /* make data=ordered the default */
        if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
            !reiserfs_data_writeback(s)) {
-               REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+               sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
        }
 
        if (reiserfs_data_log(s)) {
@@ -2003,6 +2037,8 @@ error_unlocked:
                reiserfs_write_unlock(s);
        }
 
+       cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+
        reiserfs_free_bitmap_cache(s);
        if (SB_BUFFER_WITH_SB(s))
                brelse(SB_BUFFER_WITH_SB(s));
index 7ae2a574cb25a64902128f53832b317202dbee8f..9f35a37173de0de1f7fbd8d80ca8ad39b50e3782 100644 (file)
@@ -269,12 +269,13 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                if (ufd < 0)
                        kfree(ctx);
        } else {
-               struct file *file = fget(ufd);
+               int fput_needed;
+               struct file *file = fget_light(ufd, &fput_needed);
                if (!file)
                        return -EBADF;
                ctx = file->private_data;
                if (file->f_op != &signalfd_fops) {
-                       fput(file);
+                       fput_light(file, fput_needed);
                        return -EINVAL;
                }
                spin_lock_irq(&current->sighand->siglock);
@@ -282,7 +283,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                spin_unlock_irq(&current->sighand->siglock);
 
                wake_up(&current->sighand->signalfd_wqh);
-               fput(file);
+               fput_light(file, fput_needed);
        }
 
        return ufd;
index 406ef2b792c293d709aa164481d20f7ee37b2ed4..c9f1318a3b820b363526576036c4894205552921 100644 (file)
@@ -1003,8 +1003,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
                ret = file_remove_suid(out);
                if (!ret) {
-                       file_update_time(out);
-                       ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+                       ret = file_update_time(out);
+                       if (!ret)
+                               ret = splice_from_pipe_feed(pipe, &sd,
+                                                           pipe_to_file);
                }
                mutex_unlock(&inode->i_mutex);
        } while (ret > 0);
index 43e6b6fe4e855684a197c48ed6bb8dee70f95467..95ad5c0e586c9f64fe492e141387b5092956d553 100644 (file)
@@ -87,11 +87,12 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
 
 int fd_statfs(int fd, struct kstatfs *st)
 {
-       struct file *file = fget(fd);
+       int fput_needed;
+       struct file *file = fget_light(fd, &fput_needed);
        int error = -EBADF;
        if (file) {
                error = vfs_statfs(&file->f_path, st);
-               fput(file);
+               fput_light(file, fput_needed);
        }
        return error;
 }
index 0e8db939d96f8fdaa072df7e6fcadb15a5781a84..11e3d1c449018dcf9a95c352746f46d6522c4cb2 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -188,11 +188,12 @@ static int do_fsync(unsigned int fd, int datasync)
 {
        struct file *file;
        int ret = -EBADF;
+       int fput_needed;
 
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
        if (file) {
                ret = vfs_fsync(file, datasync);
-               fput(file);
+               fput_light(file, fput_needed);
        }
        return ret;
 }
index 62a2727f4ecf71809f518dd206922fc7f9234f0d..a6d42efc76d227d62289f852982160442d7e5cea 100644 (file)
@@ -1127,16 +1127,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
        struct ubifs_inode *ui = ubifs_inode(inode);
 
        mutex_lock(&ui->ui_mutex);
-       stat->dev = inode->i_sb->s_dev;
-       stat->ino = inode->i_ino;
-       stat->mode = inode->i_mode;
-       stat->nlink = inode->i_nlink;
-       stat->uid = inode->i_uid;
-       stat->gid = inode->i_gid;
-       stat->rdev = inode->i_rdev;
-       stat->atime = inode->i_atime;
-       stat->mtime = inode->i_mtime;
-       stat->ctime = inode->i_ctime;
+       generic_fillattr(inode, stat);
        stat->blksize = UBIFS_BLOCK_SIZE;
        stat->size = ui->ui_size;
 
index a165c66e3eef2249379890c60a4c7d4111e8df4d..18024178ac4c040a3f23181ff2dc1a5cc48f2dc8 100644 (file)
@@ -1260,16 +1260,15 @@ static struct dentry *udf_fh_to_parent(struct super_block *sb,
                                 fid->udf.parent_partref,
                                 fid->udf.parent_generation);
 }
-static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
-                        int connectable)
+static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+                        struct inode *parent)
 {
        int len = *lenp;
-       struct inode *inode =  de->d_inode;
        struct kernel_lb_addr location = UDF_I(inode)->i_location;
        struct fid *fid = (struct fid *)fh;
        int type = FILEID_UDF_WITHOUT_PARENT;
 
-       if (connectable && (len < 5)) {
+       if (parent && (len < 5)) {
                *lenp = 5;
                return 255;
        } else if (len < 3) {
@@ -1282,14 +1281,11 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
        fid->udf.partref = location.partitionReferenceNum;
        fid->udf.generation = inode->i_generation;
 
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               spin_lock(&de->d_lock);
-               inode = de->d_parent->d_inode;
-               location = UDF_I(inode)->i_location;
+       if (parent) {
+               location = UDF_I(parent)->i_location;
                fid->udf.parent_block = location.logicalBlockNum;
                fid->udf.parent_partref = location.partitionReferenceNum;
                fid->udf.parent_generation = inode->i_generation;
-               spin_unlock(&de->d_lock);
                *lenp = 5;
                type = FILEID_UDF_WITH_PARENT;
        }
index ba653f3dc1bc9c66010290e53e0bb2a5b8fd94b1..fa4dbe451e278eab0f52bbacc110b157a300cdad 100644 (file)
@@ -140,18 +140,19 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
                goto out;
 
        if (filename == NULL && dfd != AT_FDCWD) {
+               int fput_needed;
                struct file *file;
 
                if (flags & AT_SYMLINK_NOFOLLOW)
                        goto out;
 
-               file = fget(dfd);
+               file = fget_light(dfd, &fput_needed);
                error = -EBADF;
                if (!file)
                        goto out;
 
                error = utimes_common(&file->f_path, times);
-               fput(file);
+               fput_light(file, fput_needed);
        } else {
                struct path path;
                int lookup_flags = 0;
index 3c8c1cc333c7c79dfa105049a62d8b6e1c28661c..1d7ac379045879b827b196f0d7a7420fe33c783d 100644 (file)
@@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
 SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                const void __user *,value, size_t, size, int, flags)
 {
+       int fput_needed;
        struct file *f;
        struct dentry *dentry;
        int error = -EBADF;
 
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
        if (!f)
                return error;
        dentry = f->f_path.dentry;
@@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                error = setxattr(dentry, name, value, size, flags);
                mnt_drop_write_file(f);
        }
-       fput(f);
+       fput_light(f, fput_needed);
        return error;
 }
 
@@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
 SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
                void __user *, value, size_t, size)
 {
+       int fput_needed;
        struct file *f;
        ssize_t error = -EBADF;
 
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
        if (!f)
                return error;
        audit_inode(NULL, f->f_path.dentry);
        error = getxattr(f->f_path.dentry, name, value, size);
-       fput(f);
+       fput_light(f, fput_needed);
        return error;
 }
 
@@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
 
 SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
 {
+       int fput_needed;
        struct file *f;
        ssize_t error = -EBADF;
 
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
        if (!f)
                return error;
        audit_inode(NULL, f->f_path.dentry);
        error = listxattr(f->f_path.dentry, list, size);
-       fput(f);
+       fput_light(f, fput_needed);
        return error;
 }
 
@@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
 
 SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 {
+       int fput_needed;
        struct file *f;
        struct dentry *dentry;
        int error = -EBADF;
 
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
        if (!f)
                return error;
        dentry = f->f_path.dentry;
@@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
                error = removexattr(dentry, name);
                mnt_drop_write_file(f);
        }
-       fput(f);
+       fput_light(f, fput_needed);
        return error;
 }
 
index a907de565db3bf287f7d1a7894fff23f85ca18d5..4a7286c1dc80d270af40a3733870bb9dd769ee82 100644 (file)
@@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
 }
 
 void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, xfs_km_flags_t flags)
 {
        int     retries = 0;
        gfp_t   lflags = kmem_flags_convert(flags);
@@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
 }
 
 void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
 {
        void    *ptr;
 
@@ -87,7 +87,7 @@ kmem_free(const void *ptr)
 
 void *
 kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-            unsigned int __nocast flags)
+            xfs_km_flags_t flags)
 {
        void    *new;
 
@@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
 }
 
 void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 {
        int     retries = 0;
        gfp_t   lflags = kmem_flags_convert(flags);
@@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
 }
 
 void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
 {
        void    *ptr;
 
index ab7c53fe346e2273311a1b73bee866ab8c4f8d7f..b2f2620f9a87b9f1bf6836c8faf3d5039e7af94f 100644 (file)
  * General memory allocation interfaces
  */
 
-#define KM_SLEEP       0x0001u
-#define KM_NOSLEEP     0x0002u
-#define KM_NOFS                0x0004u
-#define KM_MAYFAIL     0x0008u
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP       ((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP     ((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS                ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL     ((__force xfs_km_flags_t)0x0008u)
 
 /*
  * We use a special process flag to avoid recursive callbacks into
@@ -38,7 +39,7 @@
  * warnings, so we explicitly skip any generic ones (silly of us).
  */
 static inline gfp_t
-kmem_flags_convert(unsigned int __nocast flags)
+kmem_flags_convert(xfs_km_flags_t flags)
 {
        gfp_t   lflags;
 
@@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
        return lflags;
 }
 
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
 extern void  kmem_free(const void *);
 
 static inline void *kmem_zalloc_large(size_t size)
@@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone)
                kmem_cache_destroy(zone);
 }
 
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
 
 #endif /* __XFS_SUPPORT_KMEM_H__ */
index 2d25d19c4ea17b991fa4a43dcbb340e5c957c461..42679223a0fde641e3013980fbd1e733dc6ec60e 100644 (file)
@@ -52,19 +52,18 @@ static int xfs_fileid_length(int fileid_type)
 
 STATIC int
 xfs_fs_encode_fh(
-       struct dentry           *dentry,
-       __u32                   *fh,
-       int                     *max_len,
-       int                     connectable)
+       struct inode    *inode,
+       __u32           *fh,
+       int             *max_len,
+       struct inode    *parent)
 {
        struct fid              *fid = (struct fid *)fh;
        struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fh;
-       struct inode            *inode = dentry->d_inode;
        int                     fileid_type;
        int                     len;
 
        /* Directories don't need their parent encoded, they have ".." */
-       if (S_ISDIR(inode->i_mode) || !connectable)
+       if (!parent)
                fileid_type = FILEID_INO32_GEN;
        else
                fileid_type = FILEID_INO32_GEN_PARENT;
@@ -96,20 +95,16 @@ xfs_fs_encode_fh(
 
        switch (fileid_type) {
        case FILEID_INO32_GEN_PARENT:
-               spin_lock(&dentry->d_lock);
-               fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
-               fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
+               fid->i32.parent_ino = XFS_I(parent)->i_ino;
+               fid->i32.parent_gen = parent->i_generation;
                /*FALLTHRU*/
        case FILEID_INO32_GEN:
                fid->i32.ino = XFS_I(inode)->i_ino;
                fid->i32.gen = inode->i_generation;
                break;
        case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-               spin_lock(&dentry->d_lock);
-               fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
-               fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
+               fid64->parent_ino = XFS_I(parent)->i_ino;
+               fid64->parent_gen = parent->i_generation;
                /*FALLTHRU*/
        case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
                fid64->ino = XFS_I(inode)->i_ino;
index 8d214b87f6bb06ed1f7ed204cdfda8da9345172f..9f7ec15a65222e2fe318e0ab81ac9cca0a664b4a 100644 (file)
@@ -586,8 +586,11 @@ restart:
         * lock above.  Eventually we should look into a way to avoid
         * the pointless lock roundtrip.
         */
-       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-               file_update_time(file);
+       if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+               error = file_update_time(file);
+               if (error)
+                       return error;
+       }
 
        /*
         * If we're writing the file then make sure to clear the setuid and
index 6b965bf450e44d5972fc689d486deec3dd5c8094..f30d9807dc48a0535084da1afd5a7620389adcd5 100644 (file)
@@ -3152,7 +3152,7 @@ xlog_ticket_alloc(
        int             cnt,
        char            client,
        bool            permanent,
-       int             alloc_flags)
+       xfs_km_flags_t  alloc_flags)
 {
        struct xlog_ticket *tic;
        uint            num_headers;
index 735ff1ee53da447eee9c5b88d54e9007ee988138..5bc33261f5be6311fb5732f42db25e17064abded 100644 (file)
@@ -555,7 +555,7 @@ extern void  xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
 extern kmem_zone_t *xfs_log_ticket_zone;
 struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
                                int count, char client, bool permanent,
-                               int alloc_flags);
+                               xfs_km_flags_t alloc_flags);
 
 
 static inline void
index cdf896fcbfa43810c83bfbc7a99f84f118a4d75d..fdf324508c5ee467c6055f0866e1be88c387942d 100644 (file)
@@ -584,7 +584,7 @@ xfs_trans_t *
 _xfs_trans_alloc(
        xfs_mount_t     *mp,
        uint            type,
-       uint            memflags)
+       xfs_km_flags_t  memflags)
 {
        xfs_trans_t     *tp;
 
index 7ab99e1898c8de10e875aff23599d140c9867c4b..7c37b533aa8e5c169f0ef98643f96958df3788df 100644 (file)
@@ -443,7 +443,7 @@ typedef struct xfs_trans {
  * XFS transaction mechanism exported interfaces.
  */
 xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
 xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
 int            xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
                                  uint, uint);
index 91d44bd4dde32574bb6365a5526ac33a19992050..fe74fccf18db75742d240151ec358049861b7406 100644 (file)
@@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t;
 typedef unsigned int   __kernel_mode_t;
 #endif
 
-#ifndef __kernel_nlink_t
-typedef __kernel_ulong_t __kernel_nlink_t;
-#endif
-
 #ifndef __kernel_pid_t
 typedef int            __kernel_pid_t;
 #endif
index 2d09bfa5c2628a3e1e396350b7d9d14f1a2791c8..e0de516374da37de6a95c35e79ab7cabb899d177 100644 (file)
@@ -17,6 +17,7 @@
 #define ENOIOCTLCMD    515     /* No ioctl command */
 #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
 #define EPROBE_DEFER   517     /* Driver requests probe retry */
+#define EOPENSTALE     518     /* open found a stale dentry */
 
 /* Defined for the NFSv3 protocol */
 #define EBADHANDLE     521     /* Illegal NFS file handle */
index 3a4cef5322dcab4d6b50b96243fa7187b2da1ebd..12291a7ee2759164026ac602ab1712b5d66faef0 100644 (file)
@@ -165,8 +165,8 @@ struct fid {
  */
 
 struct export_operations {
-       int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len,
-                       int connectable);
+       int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
+                       struct inode *parent);
        struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid,
                        int fh_len, int fh_type);
        struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid,
index 40887afaaca7008c3becca920313ffbc555a9fd5..51978ed43e973ccf70996c19c33820fd39edeabe 100644 (file)
@@ -1692,6 +1692,7 @@ struct inode_operations {
        int (*removexattr) (struct dentry *, const char *);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
+       int (*update_time)(struct inode *, struct timespec *, int);
 } ____cacheline_aligned;
 
 struct seq_file;
@@ -1850,6 +1851,13 @@ static inline void inode_inc_iversion(struct inode *inode)
        spin_unlock(&inode->i_lock);
 }
 
+enum file_time_flags {
+       S_ATIME = 1,
+       S_MTIME = 2,
+       S_CTIME = 4,
+       S_VERSION = 8,
+};
+
 extern void touch_atime(struct path *);
 static inline void file_accessed(struct file *file)
 {
@@ -2583,7 +2591,7 @@ extern int inode_change_ok(const struct inode *, struct iattr *);
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
-extern void file_update_time(struct file *file);
+extern int file_update_time(struct file *file);
 
 extern int generic_show_options(struct seq_file *m, struct dentry *root);
 extern void save_mount_options(struct super_block *sb, char *options);
index 91d0e0a34ef3185a6051d8394cab63dfb76a04cb..63d966d5c2ea7a382c2f42cc664c7804dec86f73 100644 (file)
@@ -60,7 +60,7 @@
 #define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
                                   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
                                   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
-                                  FS_DELETE)
+                                  FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM)
 
 #define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)
 
index 912c30a8ddb1e47cd732fbd95f281238ca601ae0..f334c7fab96762ab4131c9886df87d4d6d4dde9d 100644 (file)
@@ -31,6 +31,7 @@
 #include <linux/mutex.h>
 #include <linux/timer.h>
 #include <linux/slab.h>
+#include <crypto/hash.h>
 #endif
 
 #define journal_oom_retry 1
@@ -147,12 +148,24 @@ typedef struct journal_header_s
 #define JBD2_CRC32_CHKSUM   1
 #define JBD2_MD5_CHKSUM     2
 #define JBD2_SHA1_CHKSUM    3
+#define JBD2_CRC32C_CHKSUM  4
 
 #define JBD2_CRC32_CHKSUM_SIZE 4
 
 #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
 /*
  * Commit block header for storing transactional checksums:
+ *
+ * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
+ * fields are used to store a checksum of the descriptor and data blocks.
+ *
+ * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
+ * field is used to store crc32c(uuid+commit_block).  Each journal metadata
+ * block gets its own checksum, and data block checksums are stored in
+ * journal_block_tag (in the descriptor).  The other h_chksum* fields are
+ * not used.
+ *
+ * Checksum v1 and v2 are mutually exclusive features.
  */
 struct commit_header {
        __be32          h_magic;
@@ -175,13 +188,19 @@ struct commit_header {
 typedef struct journal_block_tag_s
 {
        __be32          t_blocknr;      /* The on-disk block number */
-       __be32          t_flags;        /* See below */
+       __be16          t_checksum;     /* truncated crc32c(uuid+seq+block) */
+       __be16          t_flags;        /* See below */
        __be32          t_blocknr_high; /* most-significant high 32bits. */
 } journal_block_tag_t;
 
 #define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high))
 #define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t))
 
+/* Tail of descriptor block, for checksumming */
+struct jbd2_journal_block_tail {
+       __be32          t_checksum;     /* crc32c(uuid+descr_block) */
+};
+
 /*
  * The revoke descriptor: used on disk to describe a series of blocks to
  * be revoked from the log
@@ -192,6 +211,10 @@ typedef struct jbd2_journal_revoke_header_s
        __be32           r_count;       /* Count of bytes used in the block */
 } jbd2_journal_revoke_header_t;
 
+/* Tail of revoke block, for checksumming */
+struct jbd2_journal_revoke_tail {
+       __be32          r_checksum;     /* crc32c(uuid+revoke_block) */
+};
 
 /* Definitions for the journal tag flags word: */
 #define JBD2_FLAG_ESCAPE               1       /* on-disk block is escaped */
@@ -241,7 +264,10 @@ typedef struct journal_superblock_s
        __be32  s_max_trans_data;       /* Limit of data blocks per trans. */
 
 /* 0x0050 */
-       __u32   s_padding[44];
+       __u8    s_checksum_type;        /* checksum type */
+       __u8    s_padding2[3];
+       __u32   s_padding[42];
+       __be32  s_checksum;             /* crc32c(superblock) */
 
 /* 0x0100 */
        __u8    s_users[16*48];         /* ids of all fs'es sharing the log */
@@ -263,13 +289,15 @@ typedef struct journal_superblock_s
 #define JBD2_FEATURE_INCOMPAT_REVOKE           0x00000001
 #define JBD2_FEATURE_INCOMPAT_64BIT            0x00000002
 #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
+#define JBD2_FEATURE_INCOMPAT_CSUM_V2          0x00000008
 
 /* Features known to this kernel version: */
 #define JBD2_KNOWN_COMPAT_FEATURES     JBD2_FEATURE_COMPAT_CHECKSUM
 #define JBD2_KNOWN_ROCOMPAT_FEATURES   0
 #define JBD2_KNOWN_INCOMPAT_FEATURES   (JBD2_FEATURE_INCOMPAT_REVOKE | \
                                        JBD2_FEATURE_INCOMPAT_64BIT | \
-                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)
+                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
+                                       JBD2_FEATURE_INCOMPAT_CSUM_V2)
 
 #ifdef __KERNEL__
 
@@ -939,6 +967,12 @@ struct journal_s
         * superblock pointer here
         */
        void *j_private;
+
+       /* Reference to checksum algorithm driver via cryptoapi */
+       struct crypto_shash *j_chksum_driver;
+
+       /* Precomputed journal UUID checksum for seeding other checksums */
+       __u32 j_csum_seed;
 };
 
 /*
@@ -1268,6 +1302,25 @@ static inline int jbd_space_needed(journal_t *journal)
 
 extern int jbd_blocks_per_page(struct inode *inode);
 
+static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
+                             const void *address, unsigned int length)
+{
+       struct {
+               struct shash_desc shash;
+               char ctx[crypto_shash_descsize(journal->j_chksum_driver)];
+       } desc;
+       int err;
+
+       desc.shash.tfm = journal->j_chksum_driver;
+       desc.shash.flags = 0;
+       *(u32 *)desc.ctx = crc;
+
+       err = crypto_shash_update(&desc.shash, address, length);
+       BUG_ON(err);
+
+       return *(u32 *)desc.ctx;
+}
+
 #ifdef __KERNEL__
 
 #define buffer_trace_init(bh)  do {} while (0)
index 6230f8556a4eeac37bcaaa83a4cc913177e09c56..6133679bc4c01ace20a0114fd50ff7c3481c7eb9 100644 (file)
@@ -12,6 +12,7 @@ enum jbd_state_bits {
        BH_State,               /* Pins most journal_head state */
        BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
        BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
+       BH_Verified,            /* Metadata block has been verified ok */
        BH_JBDPrivateStart,     /* First bit available for private use by FS */
 };
 
@@ -24,6 +25,7 @@ TAS_BUFFER_FNS(Revoked, revoked)
 BUFFER_FNS(RevokeValid, revokevalid)
 TAS_BUFFER_FNS(RevokeValid, revokevalid)
 BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Verified, verified)
 
 static inline struct buffer_head *jh2bh(struct journal_head *jh)
 {
index 87f402ccec55567330943ab774ffb12ae21c7da8..f01e5f6d1f07a4966927bb7acd5707f8f77904c8 100644 (file)
 #include <linux/lockdep.h>
 #include <linux/percpu.h>
 #include <linux/cpu.h>
+#include <linux/notifier.h>
 
 /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)     name##_lock_init()
-#define br_read_lock(name)     name##_local_lock()
-#define br_read_unlock(name)   name##_local_unlock()
-#define br_write_lock(name)    name##_global_lock_online()
-#define br_write_unlock(name)  name##_global_unlock_online()
+#define br_lock_init(name)     lg_lock_init(name, #name)
+#define br_read_lock(name)     lg_local_lock(name)
+#define br_read_unlock(name)   lg_local_unlock(name)
+#define br_write_lock(name)    lg_global_lock(name)
+#define br_write_unlock(name)  lg_global_unlock(name)
 
-#define DECLARE_BRLOCK(name)   DECLARE_LGLOCK(name)
 #define DEFINE_BRLOCK(name)    DEFINE_LGLOCK(name)
 
-
-#define lg_lock_init(name)     name##_lock_init()
-#define lg_local_lock(name)    name##_local_lock()
-#define lg_local_unlock(name)  name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)   name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)   name##_global_lock()
-#define lg_global_unlock(name) name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
-
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #define LOCKDEP_INIT_MAP lockdep_init_map
 
 #define DEFINE_LGLOCK_LOCKDEP(name)
 #endif
 
-
-#define DECLARE_LGLOCK(name)                                           \
- extern void name##_lock_init(void);                                   \
- extern void name##_local_lock(void);                                  \
- extern void name##_local_unlock(void);                                        \
- extern void name##_local_lock_cpu(int cpu);                           \
- extern void name##_local_unlock_cpu(int cpu);                         \
- extern void name##_global_lock(void);                                 \
- extern void name##_global_unlock(void);                               \
- extern void name##_global_lock_online(void);                          \
- extern void name##_global_unlock_online(void);                                \
+struct lglock {
+       arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lock_class_key lock_key;
+       struct lockdep_map    lock_dep_map;
+#endif
+};
 
 #define DEFINE_LGLOCK(name)                                            \
-                                                                       \
- DEFINE_SPINLOCK(name##_cpu_lock);                                     \
- cpumask_t name##_cpus __read_mostly;                                  \
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);                         \
- DEFINE_LGLOCK_LOCKDEP(name);                                          \
-                                                                       \
- static int                                                            \
- name##_lg_cpu_callback(struct notifier_block *nb,                     \
-                               unsigned long action, void *hcpu)       \
- {                                                                     \
-       switch (action & ~CPU_TASKS_FROZEN) {                           \
-       case CPU_UP_PREPARE:                                            \
-               spin_lock(&name##_cpu_lock);                            \
-               cpu_set((unsigned long)hcpu, name##_cpus);              \
-               spin_unlock(&name##_cpu_lock);                          \
-               break;                                                  \
-       case CPU_UP_CANCELED: case CPU_DEAD:                            \
-               spin_lock(&name##_cpu_lock);                            \
-               cpu_clear((unsigned long)hcpu, name##_cpus);            \
-               spin_unlock(&name##_cpu_lock);                          \
-       }                                                               \
-       return NOTIFY_OK;                                               \
- }                                                                     \
- static struct notifier_block name##_lg_cpu_notifier = {               \
-       .notifier_call = name##_lg_cpu_callback,                        \
- };                                                                    \
- void name##_lock_init(void) {                                         \
-       int i;                                                          \
-       LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;     \
-       }                                                               \
-       register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
-       get_online_cpus();                                              \
-       for_each_online_cpu(i)                                          \
-               cpu_set(i, name##_cpus);                                \
-       put_online_cpus();                                              \
- }                                                                     \
- EXPORT_SYMBOL(name##_lock_init);                                      \
-                                                                       \
- void name##_local_lock(void) {                                                \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock);                                     \
-                                                                       \
- void name##_local_unlock(void) {                                      \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock);                                   \
-                                                                       \
- void name##_local_lock_cpu(int cpu) {                                 \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock_cpu);                                 \
-                                                                       \
- void name##_local_unlock_cpu(int cpu) {                               \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock_cpu);                               \
-                                                                       \
- void name##_global_lock_online(void) {                                        \
-       int i;                                                          \
-       spin_lock(&name##_cpu_lock);                                    \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_cpu(i, &name##_cpus) {                                 \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock_online);                             \
-                                                                       \
- void name##_global_unlock_online(void) {                              \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_cpu(i, &name##_cpus) {                                 \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       spin_unlock(&name##_cpu_lock);                                  \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock_online);                           \
-                                                                       \
- void name##_global_lock(void) {                                       \
-       int i;                                                          \
-       preempt_disable();                                              \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock);                                    \
-                                                                       \
- void name##_global_unlock(void) {                                     \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock);
+       DEFINE_LGLOCK_LOCKDEP(name);                                    \
+       DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)                  \
+       = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
+       struct lglock name = { .lock = &name ## _lock }
+
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
 #endif
index ce26716238c3632ba95d909bf69e1e4d4bc91da2..b36d08ce5c578dcd18e224828217ded481de54ee 100644 (file)
@@ -1392,7 +1392,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
 extern unsigned long mmap_region(struct file *file, unsigned long addr,
        unsigned long len, unsigned long flags,
        vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap(struct file *, unsigned long,
+extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
         unsigned long, unsigned long,
         unsigned long, unsigned long);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t);
index ab0e091ce5facf0047c57191f9e631fd5c4bb791..4e5a73cdbbef18463920022626931d02c0540eb9 100644 (file)
@@ -86,9 +86,9 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
 extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
 extern int cap_inode_need_killpriv(struct dentry *dentry);
 extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_file_mmap(struct file *file, unsigned long reqprot,
-                        unsigned long prot, unsigned long flags,
-                        unsigned long addr, unsigned long addr_only);
+extern int cap_mmap_addr(unsigned long addr);
+extern int cap_mmap_file(struct file *file, unsigned long reqprot,
+                        unsigned long prot, unsigned long flags);
 extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
 extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                          unsigned long arg4, unsigned long arg5);
@@ -586,15 +586,17 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *     simple integer value.  When @arg represents a user space pointer, it
  *     should never be used by the security module.
  *     Return 0 if permission is granted.
- * @file_mmap :
+ * @mmap_addr :
+ *     Check permissions for a mmap operation at @addr.
+ *     @addr contains virtual address that will be used for the operation.
+ *     Return 0 if permission is granted.
+ * @mmap_file :
  *     Check permissions for a mmap operation.  The @file may be NULL, e.g.
  *     if mapping anonymous memory.
  *     @file contains the file structure for file to map (may be NULL).
  *     @reqprot contains the protection requested by the application.
  *     @prot contains the protection that will be applied by the kernel.
  *     @flags contains the operational flags.
- *     @addr contains virtual address that will be used for the operation.
- *     @addr_only contains a boolean: 0 if file-backed VMA, otherwise 1.
  *     Return 0 if permission is granted.
  * @file_mprotect:
  *     Check permissions before changing memory access permissions.
@@ -1481,10 +1483,10 @@ struct security_operations {
        void (*file_free_security) (struct file *file);
        int (*file_ioctl) (struct file *file, unsigned int cmd,
                           unsigned long arg);
-       int (*file_mmap) (struct file *file,
+       int (*mmap_addr) (unsigned long addr);
+       int (*mmap_file) (struct file *file,
                          unsigned long reqprot, unsigned long prot,
-                         unsigned long flags, unsigned long addr,
-                         unsigned long addr_only);
+                         unsigned long flags);
        int (*file_mprotect) (struct vm_area_struct *vma,
                              unsigned long reqprot,
                              unsigned long prot);
@@ -1743,9 +1745,9 @@ int security_file_permission(struct file *file, int mask);
 int security_file_alloc(struct file *file);
 void security_file_free(struct file *file);
 int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int security_file_mmap(struct file *file, unsigned long reqprot,
-                       unsigned long prot, unsigned long flags,
-                       unsigned long addr, unsigned long addr_only);
+int security_mmap_file(struct file *file, unsigned long prot,
+                       unsigned long flags);
+int security_mmap_addr(unsigned long addr);
 int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                           unsigned long prot);
 int security_file_lock(struct file *file, unsigned int cmd);
@@ -2181,13 +2183,15 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
        return 0;
 }
 
-static inline int security_file_mmap(struct file *file, unsigned long reqprot,
-                                    unsigned long prot,
-                                    unsigned long flags,
-                                    unsigned long addr,
-                                    unsigned long addr_only)
+static inline int security_mmap_file(struct file *file, unsigned long prot,
+                                    unsigned long flags)
+{
+       return 0;
+}
+
+static inline int security_mmap_addr(unsigned long addr)
 {
-       return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
+       return cap_mmap_addr(addr);
 }
 
 static inline int security_file_mprotect(struct vm_area_struct *vma,
index 51b29ac45a8e7b26583df0217ab37a0d939ad6da..40e0a273faea3c07470e19fd23673fda89543f9b 100644 (file)
@@ -232,7 +232,6 @@ struct svc_rqst {
        struct svc_pool *       rq_pool;        /* thread pool */
        struct svc_procedure *  rq_procinfo;    /* procedure info */
        struct auth_ops *       rq_authop;      /* authentication flavour */
-       u32                     rq_flavor;      /* pseudoflavor */
        struct svc_cred         rq_cred;        /* auth info */
        void *                  rq_xprt_ctxt;   /* transport specific context ptr */
        struct svc_deferred_req*rq_deferred;    /* deferred request we are replaying */
@@ -416,6 +415,7 @@ struct svc_procedure {
  */
 int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
 void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
+int svc_bind(struct svc_serv *serv, struct net *net);
 struct svc_serv *svc_create(struct svc_program *, unsigned int,
                            void (*shutdown)(struct svc_serv *, struct net *net));
 struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
index 2c54683b91decae417967b5a0703c96d84de9714..dd74084a9799891309f54db25b8259ae3388f3c8 100644 (file)
 #include <linux/sunrpc/msg_prot.h>
 #include <linux/sunrpc/cache.h>
 #include <linux/hash.h>
+#include <linux/cred.h>
 
 struct svc_cred {
        uid_t                   cr_uid;
        gid_t                   cr_gid;
        struct group_info       *cr_group_info;
+       u32                     cr_flavor; /* pseudoflavor */
+       char                    *cr_principal; /* for gss */
 };
 
+static inline void free_svc_cred(struct svc_cred *cred)
+{
+       if (cred->cr_group_info)
+               put_group_info(cred->cr_group_info);
+       kfree(cred->cr_principal);
+}
+
 struct svc_rqst;               /* forward decl */
 struct in6_addr;
 
index 7c32daa025eb07b644d8185a27c8ea10d8b7c55f..726aff1a52011fcdfd3ab1e11b8a82ff1dbea703 100644 (file)
@@ -22,7 +22,6 @@ int gss_svc_init_net(struct net *net);
 void gss_svc_shutdown_net(struct net *net);
 int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
 u32 svcauth_gss_flavor(struct auth_domain *dom);
-char *svc_gss_principal(struct svc_rqst *);
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
index 7f480db60231a714b9e520f3a16856c5d4e4a5e1..9c1bd539ea70e780e0e926b54bfc9320d3ec34a4 100644 (file)
@@ -25,7 +25,7 @@ typedef __kernel_dev_t                dev_t;
 typedef __kernel_ino_t         ino_t;
 typedef __kernel_mode_t                mode_t;
 typedef unsigned short         umode_t;
-typedef __kernel_nlink_t       nlink_t;
+typedef __u32                  nlink_t;
 typedef __kernel_off_t         off_t;
 typedef __kernel_pid_t         pid_t;
 typedef __kernel_daddr_t       daddr_t;
index 406c5b208193373b979ce82bffe6617250ea64ed..5e2cbfdab6fc0d6b96a19c321a9208dda8cd130d 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1036,6 +1036,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
        sfd->file = shp->shm_file;
        sfd->vm_ops = NULL;
 
+       err = security_mmap_file(file, prot, flags);
+       if (err)
+               goto out_fput;
+
        down_write(&current->mm->mmap_sem);
        if (addr && !(shmflg & SHM_REMAP)) {
                err = -EINVAL;
@@ -1050,7 +1054,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
                        goto invalid;
        }
                
-       user_addr = do_mmap (file, addr, size, prot, flags, 0);
+       user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
        *raddr = user_addr;
        err = 0;
        if (IS_ERR_VALUE(user_addr))
@@ -1058,6 +1062,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
 invalid:
        up_write(&current->mm->mmap_sem);
 
+out_fput:
        fput(file);
 
 out_nattch:
index 6f3d0ae044b24769c1caa61346174bc44a3684bc..c0cc67ad764ceddbe9f226ee1bfb90c4055f19ff 100644 (file)
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
            kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
            hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
            notifier.o ksysfs.o cred.o \
-           async.o range.o groups.o
+           async.o range.o groups.o lglock.o
 
 ifdef CONFIG_FUNCTION_TRACER
 # Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c
new file mode 100644 (file)
index 0000000..6535a66
--- /dev/null
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there).
+ * An uninit could be added though: just undo lg_lock_init().
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+       LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+       int i;
+
+       preempt_disable();
+       rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_lock(lock);
+       }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+       int i;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_unlock(lock);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
index 5646c740f613ed1ec8b34a094f76d7934eed1aac..32e6f4136fa2297e13a6ac51444d50c18b78e9a3 100644 (file)
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
 static int cleancache_get_key(struct inode *inode,
                              struct cleancache_filekey *key)
 {
-       int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+       int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
        int len = 0, maxlen = CLEANCACHE_KEY_MAX;
        struct super_block *sb = inode->i_sb;
 
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
        if (sb->s_export_op != NULL) {
                fhfn = sb->s_export_op->encode_fh;
                if  (fhfn) {
-                       struct dentry d;
-                       d.d_inode = inode;
-                       len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+                       len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
                        if (len <= 0 || len == 255)
                                return -1;
                        if (maxlen > CLEANCACHE_KEY_MAX)
index 64b48f934b897451154e5517b38704fb19e4f86a..a4a5260b0279b77b37738540b1e8c24fb446a3e5 100644 (file)
@@ -1899,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page);
 
-/*
- * The logic we want is
- *
- *     if suid or (sgid and xgrp)
- *             remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
-       umode_t mode = dentry->d_inode->i_mode;
-       int kill = 0;
-
-       /* suid always must be killed */
-       if (unlikely(mode & S_ISUID))
-               kill = ATTR_KILL_SUID;
-
-       /*
-        * sgid without any exec bits is just a mandatory locking mark; leave
-        * it alone.  If some exec bits are set, it's a real sgid; kill it.
-        */
-       if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
-               kill |= ATTR_KILL_SGID;
-
-       if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
-               return kill;
-
-       return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-static int __remove_suid(struct dentry *dentry, int kill)
-{
-       struct iattr newattrs;
-
-       newattrs.ia_valid = ATTR_FORCE | kill;
-       return notify_change(dentry, &newattrs);
-}
-
-int file_remove_suid(struct file *file)
-{
-       struct dentry *dentry = file->f_path.dentry;
-       struct inode *inode = dentry->d_inode;
-       int killsuid;
-       int killpriv;
-       int error = 0;
-
-       /* Fast path for nothing security related */
-       if (IS_NOSEC(inode))
-               return 0;
-
-       killsuid = should_remove_suid(dentry);
-       killpriv = security_inode_need_killpriv(dentry);
-
-       if (killpriv < 0)
-               return killpriv;
-       if (killpriv)
-               error = security_inode_killpriv(dentry);
-       if (!error && killsuid)
-               error = __remove_suid(dentry, killsuid);
-       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
-               inode->i_flags |= S_NOSEC;
-
-       return error;
-}
-EXPORT_SYMBOL(file_remove_suid);
-
 static size_t __iovec_copy_from_user_inatomic(char *vaddr,
                        const struct iovec *iov, size_t base, size_t bytes)
 {
@@ -2489,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
        if (err)
                goto out;
 
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
 
        /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
        if (unlikely(file->f_flags & O_DIRECT)) {
index a4eb3113222912c9aada14bd92c6b68d01577b73..213ca1f5340980e1ce6fad8d4f12e50858d61397 100644 (file)
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
        if (ret)
                goto out_backing;
 
-       file_update_time(filp);
+       ret = file_update_time(filp);
+       if (ret)
+               goto out_backing;
 
        ret = __xip_file_write (filp, buf, count, pos, ppos);
 
index 4194ab9dc19b412aa8e15f1b89612aa2595f0c9f..5cbb78190041573ee4e92e77a12a8f129514862f 100644 (file)
@@ -350,3 +350,7 @@ extern u64 hwpoison_filter_flags_mask;
 extern u64 hwpoison_filter_flags_value;
 extern u64 hwpoison_filter_memcg;
 extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+        unsigned long, unsigned long,
+        unsigned long, unsigned long);
index 4a9c2a391e28efe523e1a3aaaf9ab06d24627e86..3edfcdfa42d9f27a5238780065220ec3b4fc702a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -971,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
  * The caller must hold down_write(&current->mm->mmap_sem).
  */
 
-static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                        unsigned long len, unsigned long prot,
                        unsigned long flags, unsigned long pgoff)
 {
        struct mm_struct * mm = current->mm;
        struct inode *inode;
        vm_flags_t vm_flags;
-       int error;
-       unsigned long reqprot = prot;
 
        /*
         * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1101,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                }
        }
 
-       error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
-       if (error)
-               return error;
-
        return mmap_region(file, addr, len, flags, vm_flags, pgoff);
 }
 
-unsigned long do_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       if (unlikely(offset + PAGE_ALIGN(len) < offset))
-               return -EINVAL;
-       if (unlikely(offset & ~PAGE_MASK))
-               return -EINVAL;
-       return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       unsigned long ret;
-       struct mm_struct *mm = current->mm;
-
-       down_write(&mm->mmap_sem);
-       ret = do_mmap(file, addr, len, prot, flag, offset);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
@@ -1165,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-       down_write(&current->mm->mmap_sem);
-       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&current->mm->mmap_sem);
-
+       retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
        if (file)
                fput(file);
 out:
@@ -1629,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
        if (addr & ~PAGE_MASK)
                return -EINVAL;
 
-       return arch_rebalance_pgtables(addr, len);
+       addr = arch_rebalance_pgtables(addr, len);
+       error = security_mmap_addr(addr);
+       return error ? error : addr;
 }
 
 EXPORT_SYMBOL(get_unmapped_area);
@@ -1819,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
                return -ENOMEM;
 
        address &= PAGE_MASK;
-       error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+       error = security_mmap_addr(address);
        if (error)
                return error;
 
@@ -2159,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 
        return 0;
 }
-EXPORT_SYMBOL(do_munmap);
 
 int vm_munmap(unsigned long start, size_t len)
 {
@@ -2207,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
        if (!len)
                return addr;
 
-       error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
-       if (error)
-               return error;
-
        flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
 
        error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2563,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
        vma->vm_ops = &special_mapping_vmops;
        vma->vm_private_data = pages;
 
-       ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
-       if (ret)
-               goto out;
-
        ret = insert_vm_struct(mm, vma);
        if (ret)
                goto out;
index db8d983b5a7d7a2d6746ccbf74471d2ced3cdd96..21fed202ddad865bb3ee70d07d8ebce17fd37493 100644 (file)
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
        if ((addr <= new_addr) && (addr+old_len) > new_addr)
                goto out;
 
-       ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-       if (ret)
-               goto out;
-
        ret = do_munmap(mm, new_addr, new_len);
        if (ret)
                goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
  * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
  * This option implies MREMAP_MAYMOVE.
  */
-unsigned long do_mremap(unsigned long addr,
-       unsigned long old_len, unsigned long new_len,
-       unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+               unsigned long, new_len, unsigned long, flags,
+               unsigned long, new_addr)
 {
        struct mm_struct *mm = current->mm;
        struct vm_area_struct *vma;
        unsigned long ret = -EINVAL;
        unsigned long charged = 0;
 
+       down_write(&current->mm->mmap_sem);
+
        if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                goto out;
 
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
                        goto out;
                }
 
-               ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-               if (ret)
-                       goto out;
                ret = move_vma(vma, addr, old_len, new_len, new_addr);
        }
 out:
        if (ret & ~PAGE_MASK)
                vm_unacct_memory(charged);
-       return ret;
-}
-
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
-               unsigned long, new_len, unsigned long, flags,
-               unsigned long, new_addr)
-{
-       unsigned long ret;
-
-       down_write(&current->mm->mmap_sem);
-       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
        up_write(&current->mm->mmap_sem);
        return ret;
 }
index bb8f4f004a82ce57abb0653a9a8ed72d533f5c45..c4acfbc099727b3f5151ed5917961ff59efb7481 100644 (file)
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
                                 unsigned long *_capabilities)
 {
        unsigned long capabilities, rlen;
-       unsigned long reqprot = prot;
        int ret;
 
        /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
        }
 
        /* allow the security API to have its say */
-       ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+       ret = security_mmap_addr(addr);
        if (ret < 0)
                return ret;
 
@@ -1233,7 +1232,7 @@ enomem:
 /*
  * handle mapping creation for uClinux
  */
-static unsigned long do_mmap_pgoff(struct file *file,
+unsigned long do_mmap_pgoff(struct file *file,
                            unsigned long addr,
                            unsigned long len,
                            unsigned long prot,
@@ -1471,32 +1470,6 @@ error_getting_region:
        return -ENOMEM;
 }
 
-unsigned long do_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       if (unlikely(offset + PAGE_ALIGN(len) < offset))
-               return -EINVAL;
-       if (unlikely(offset & ~PAGE_MASK))
-               return -EINVAL;
-       return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       unsigned long ret;
-       struct mm_struct *mm = current->mm;
-
-       down_write(&mm->mmap_sem);
-       ret = do_mmap(file, addr, len, prot, flag, offset);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
 SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                unsigned long, prot, unsigned long, flags,
                unsigned long, fd, unsigned long, pgoff)
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 
        flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
 
-       down_write(&current->mm->mmap_sem);
-       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&current->mm->mmap_sem);
+       ret = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
 
        if (file)
                fput(file);
index d576b84d913c40c89232b5709360b6e4abce4dad..585bd220a21ee4e5eefaec2b9c46dc9ae68fde98 100644 (file)
@@ -2439,11 +2439,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
        return dentry;
 }
 
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
-                               int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+                               struct inode *parent)
 {
-       struct inode *inode = dentry->d_inode;
-
        if (*len < 3) {
                *len = 3;
                return 255;
index ae962b31de888a55990769aae948bac3ef0db338..8c7265afa29f2109b884907daa050b79f0b25f8b 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
 #include <linux/export.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/security.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flag, unsigned long pgoff)
+{
+       unsigned long ret;
+       struct mm_struct *mm = current->mm;
+
+       ret = security_mmap_file(file, prot, flag);
+       if (!ret) {
+               down_write(&mm->mmap_sem);
+               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+               up_write(&mm->mmap_sem);
+       }
+       return ret;
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flag, unsigned long offset)
+{
+       if (unlikely(offset + PAGE_ALIGN(len) < offset))
+               return -EINVAL;
+       if (unlikely(offset & ~PAGE_MASK))
+               return -EINVAL;
+
+       return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
index 8522a4793374136fa4ab66aa9b325b48019801ac..ca8e0a57d945dabeb51147f338cef363672040d5 100644 (file)
@@ -16,8 +16,6 @@
 #include <net/netlink.h>
 #include <net/pkt_sched.h>
 
-extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
-
 /*
  * The ATM queuing discipline provides a framework for invoking classifiers
  * (aka "filters"), which in turn select classes of this queuing discipline.
index 38f388c39dce89a5e6456514771f70ef975af1c0..107c4528654fd5867b8363ccdf66c648e9202a34 100644 (file)
@@ -381,21 +381,53 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
 }
 
 /*
- * We cannot currently handle tokens with rotated data.  We need a
- * generalized routine to rotate the data in place.  It is anticipated
- * that we won't encounter rotated data in the general case.
+ * We can shift data by up to LOCAL_BUF_LEN bytes in a pass.  If we need
+ * to do more than that, we shift repeatedly.  Kevin Coffman reports
+ * seeing 28 bytes as the value used by Microsoft clients and servers
+ * with AES, so this constant is chosen to allow handling 28 in one pass
+ * without using too much stack space.
+ *
+ * If that proves to be a problem perhaps we could use a more clever
+ * algorithm.
  */
-static u32
-rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc)
+#define LOCAL_BUF_LEN 32u
+
+static void rotate_buf_a_little(struct xdr_buf *buf, unsigned int shift)
 {
-       unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN);
+       char head[LOCAL_BUF_LEN];
+       char tmp[LOCAL_BUF_LEN];
+       unsigned int this_len, i;
+
+       BUG_ON(shift > LOCAL_BUF_LEN);
 
-       if (realrrc == 0)
-               return 0;
+       read_bytes_from_xdr_buf(buf, 0, head, shift);
+       for (i = 0; i + shift < buf->len; i += LOCAL_BUF_LEN) {
+               this_len = min(LOCAL_BUF_LEN, buf->len - (i + shift));
+               read_bytes_from_xdr_buf(buf, i+shift, tmp, this_len);
+               write_bytes_to_xdr_buf(buf, i, tmp, this_len);
+       }
+       write_bytes_to_xdr_buf(buf, buf->len - shift, head, shift);
+}
 
-       dprintk("%s: cannot process token with rotated data: "
-               "rrc %u, realrrc %u\n", __func__, rrc, realrrc);
-       return 1;
+static void _rotate_left(struct xdr_buf *buf, unsigned int shift)
+{
+       int shifted = 0;
+       int this_shift;
+
+       shift %= buf->len;
+       while (shifted < shift) {
+               this_shift = min(shift - shifted, LOCAL_BUF_LEN);
+               rotate_buf_a_little(buf, this_shift);
+               shifted += this_shift;
+       }
+}
+
+static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift)
+{
+       struct xdr_buf subbuf;
+
+       xdr_buf_subsegment(buf, &subbuf, base, buf->len - base);
+       _rotate_left(&subbuf, shift);
 }
 
 static u32
@@ -495,11 +527,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
 
        seqnum = be64_to_cpup((__be64 *)(ptr + 8));
 
-       if (rrc != 0) {
-               err = rotate_left(kctx, offset, buf, rrc);
-               if (err)
-                       return GSS_S_FAILURE;
-       }
+       if (rrc != 0)
+               rotate_left(offset + 16, buf, rrc);
 
        err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
                                        &headskip, &tailskip);
index 3089de37c433157cd45cff21974e0a242743c715..73e95738660042e7a9d4e7cb252143ec74078265 100644 (file)
@@ -336,7 +336,6 @@ struct rsc {
        struct svc_cred         cred;
        struct gss_svc_seq_data seqdata;
        struct gss_ctx          *mechctx;
-       char                    *client_name;
 };
 
 static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -347,9 +346,7 @@ static void rsc_free(struct rsc *rsci)
        kfree(rsci->handle.data);
        if (rsci->mechctx)
                gss_delete_sec_context(&rsci->mechctx);
-       if (rsci->cred.cr_group_info)
-               put_group_info(rsci->cred.cr_group_info);
-       kfree(rsci->client_name);
+       free_svc_cred(&rsci->cred);
 }
 
 static void rsc_put(struct kref *ref)
@@ -387,7 +384,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
        tmp->handle.data = NULL;
        new->mechctx = NULL;
        new->cred.cr_group_info = NULL;
-       new->client_name = NULL;
+       new->cred.cr_principal = NULL;
 }
 
 static void
@@ -402,8 +399,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
        spin_lock_init(&new->seqdata.sd_lock);
        new->cred = tmp->cred;
        tmp->cred.cr_group_info = NULL;
-       new->client_name = tmp->client_name;
-       tmp->client_name = NULL;
+       new->cred.cr_principal = tmp->cred.cr_principal;
+       tmp->cred.cr_principal = NULL;
 }
 
 static struct cache_head *
@@ -501,8 +498,8 @@ static int rsc_parse(struct cache_detail *cd,
                /* get client name */
                len = qword_get(&mesg, buf, mlen);
                if (len > 0) {
-                       rsci.client_name = kstrdup(buf, GFP_KERNEL);
-                       if (!rsci.client_name)
+                       rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
+                       if (!rsci.cred.cr_principal)
                                goto out;
                }
 
@@ -932,16 +929,6 @@ struct gss_svc_data {
        struct rsc                      *rsci;
 };
 
-char *svc_gss_principal(struct svc_rqst *rqstp)
-{
-       struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data;
-
-       if (gd && gd->rsci)
-               return gd->rsci->client_name;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(svc_gss_principal);
-
 static int
 svcauth_gss_set_client(struct svc_rqst *rqstp)
 {
@@ -1220,7 +1207,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
                }
                svcdata->rsci = rsci;
                cache_get(&rsci->h);
-               rqstp->rq_flavor = gss_svc_to_pseudoflavor(
+               rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor(
                                        rsci->mechctx->mech_type, gc->gc_svc);
                ret = SVC_OK;
                goto out;
index 3c0653439f3dc398031301d53819c3b6e78bc4ef..92509ffe15fcacce5de331cbb205a84c4f718a86 100644 (file)
@@ -180,14 +180,16 @@ void rpcb_put_local(struct net *net)
        struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
        struct rpc_clnt *clnt = sn->rpcb_local_clnt;
        struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4;
-       int shutdown;
+       int shutdown = 0;
 
        spin_lock(&sn->rpcb_clnt_lock);
-       if (--sn->rpcb_users == 0) {
-               sn->rpcb_local_clnt = NULL;
-               sn->rpcb_local_clnt4 = NULL;
+       if (sn->rpcb_users) {
+               if (--sn->rpcb_users == 0) {
+                       sn->rpcb_local_clnt = NULL;
+                       sn->rpcb_local_clnt4 = NULL;
+               }
+               shutdown = !sn->rpcb_users;
        }
-       shutdown = !sn->rpcb_users;
        spin_unlock(&sn->rpcb_clnt_lock);
 
        if (shutdown) {
index 017c0117d1543a784dfe5130396c74f80879131a..7e9baaa1e543e55878dcb0d9bd0378a0e51754e0 100644 (file)
@@ -407,6 +407,14 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
        return 0;
 }
 
+int svc_bind(struct svc_serv *serv, struct net *net)
+{
+       if (!svc_uses_rpcbind(serv))
+               return 0;
+       return svc_rpcb_setup(serv, net);
+}
+EXPORT_SYMBOL_GPL(svc_bind);
+
 /*
  * Create an RPC service
  */
@@ -471,15 +479,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
                spin_lock_init(&pool->sp_lock);
        }
 
-       if (svc_uses_rpcbind(serv)) {
-               if (svc_rpcb_setup(serv, current->nsproxy->net_ns) < 0) {
-                       kfree(serv->sv_pools);
-                       kfree(serv);
-                       return NULL;
-               }
-               if (!serv->sv_shutdown)
-                       serv->sv_shutdown = svc_rpcb_cleanup;
-       }
+       if (svc_uses_rpcbind(serv) && (!serv->sv_shutdown))
+               serv->sv_shutdown = svc_rpcb_cleanup;
 
        return serv;
 }
@@ -536,8 +537,6 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net);
 void
 svc_destroy(struct svc_serv *serv)
 {
-       struct net *net = current->nsproxy->net_ns;
-
        dprintk("svc: svc_destroy(%s, %d)\n",
                                serv->sv_program->pg_name,
                                serv->sv_nrthreads);
@@ -552,8 +551,6 @@ svc_destroy(struct svc_serv *serv)
 
        del_timer_sync(&serv->sv_temptimer);
 
-       svc_shutdown_net(serv, net);
-
        /*
         * The last user is gone and thus all sockets have to be destroyed to
         * the point. Check this.
index b98ee35149121602b42ace9365bfd5f21e84767a..88f2bf671960d444e73d3d9eba2998f75ac2885b 100644 (file)
@@ -598,6 +598,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
 
        /* now allocate needed pages.  If we get a failure, sleep briefly */
        pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+       BUG_ON(pages >= RPCSVC_MAXPAGES);
        for (i = 0; i < pages ; i++)
                while (rqstp->rq_pages[i] == NULL) {
                        struct page *p = alloc_page(GFP_KERNEL);
@@ -612,7 +613,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
                        rqstp->rq_pages[i] = p;
                }
        rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
-       BUG_ON(pages >= RPCSVC_MAXPAGES);
 
        /* Make arg->head point to first page and arg->pages point to rest */
        arg = &rqstp->rq_arg;
@@ -973,7 +973,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
        svc_clear_pools(serv, net);
        /*
         * At this point the sp_sockets lists will stay empty, since
-        * svc_enqueue will not add new entries without taking the
+        * svc_xprt_enqueue will not add new entries without taking the
         * sp_lock and checking XPT_BUSY.
         */
        svc_clear_list(&serv->sv_tempsocks, net);
index 6138c925923d00715cb8695ebd2cfbb719287699..2777fa896645de3f063aa5ad67cb054bbb75a894 100644 (file)
@@ -746,6 +746,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
        struct svc_cred *cred = &rqstp->rq_cred;
 
        cred->cr_group_info = NULL;
+       cred->cr_principal = NULL;
        rqstp->rq_client = NULL;
 
        if (argv->iov_len < 3*4)
@@ -773,7 +774,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
        svc_putnl(resv, RPC_AUTH_NULL);
        svc_putnl(resv, 0);
 
-       rqstp->rq_flavor = RPC_AUTH_NULL;
+       rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL;
        return SVC_OK;
 }
 
@@ -811,6 +812,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
        int             len   = argv->iov_len;
 
        cred->cr_group_info = NULL;
+       cred->cr_principal = NULL;
        rqstp->rq_client = NULL;
 
        if ((len -= 3*4) < 0)
@@ -847,7 +849,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
        svc_putnl(resv, RPC_AUTH_NULL);
        svc_putnl(resv, 0);
 
-       rqstp->rq_flavor = RPC_AUTH_UNIX;
+       rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX;
        return SVC_OK;
 
 badcred:
index 032daab449b0bb3007562e795593a15d247a2c58..8ea39aabe94889a224c868757196afc084c578b8 100644 (file)
@@ -490,17 +490,9 @@ static int common_mmap(int op, struct file *file, unsigned long prot,
        return common_file_perm(op, file, mask);
 }
 
-static int apparmor_file_mmap(struct file *file, unsigned long reqprot,
-                             unsigned long prot, unsigned long flags,
-                             unsigned long addr, unsigned long addr_only)
+static int apparmor_mmap_file(struct file *file, unsigned long reqprot,
+                             unsigned long prot, unsigned long flags)
 {
-       int rc = 0;
-
-       /* do DAC check */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
-
        return common_mmap(OP_FMMAP, file, prot, flags);
 }
 
@@ -646,7 +638,8 @@ static struct security_operations apparmor_ops = {
        .file_permission =              apparmor_file_permission,
        .file_alloc_security =          apparmor_file_alloc_security,
        .file_free_security =           apparmor_file_free_security,
-       .file_mmap =                    apparmor_file_mmap,
+       .mmap_file =                    apparmor_mmap_file,
+       .mmap_addr =                    cap_mmap_addr,
        .file_mprotect =                apparmor_file_mprotect,
        .file_lock =                    apparmor_file_lock,
 
index fca889676c5e9e5726f6c3136fb8c26cfe85f63c..61095df8b89ac452d50528144a67dca751d4f992 100644 (file)
@@ -949,7 +949,8 @@ void __init security_fixup_ops(struct security_operations *ops)
        set_to_cap_if_null(ops, file_alloc_security);
        set_to_cap_if_null(ops, file_free_security);
        set_to_cap_if_null(ops, file_ioctl);
-       set_to_cap_if_null(ops, file_mmap);
+       set_to_cap_if_null(ops, mmap_addr);
+       set_to_cap_if_null(ops, mmap_file);
        set_to_cap_if_null(ops, file_mprotect);
        set_to_cap_if_null(ops, file_lock);
        set_to_cap_if_null(ops, file_fcntl);
index e771cb1b2d7947f0c85651b38cc7c9c1d3da11d7..6dbae4650abe20208ff66eb27015e21b964d0344 100644 (file)
@@ -958,22 +958,15 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
 }
 
 /*
- * cap_file_mmap - check if able to map given addr
- * @file: unused
- * @reqprot: unused
- * @prot: unused
- * @flags: unused
+ * cap_mmap_addr - check if able to map given addr
  * @addr: address attempting to be mapped
- * @addr_only: unused
  *
  * If the process is attempting to map memory below dac_mmap_min_addr they need
  * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
  * capability security module.  Returns 0 if this mapping should be allowed
  * -EPERM if not.
  */
-int cap_file_mmap(struct file *file, unsigned long reqprot,
-                 unsigned long prot, unsigned long flags,
-                 unsigned long addr, unsigned long addr_only)
+int cap_mmap_addr(unsigned long addr)
 {
        int ret = 0;
 
@@ -986,3 +979,9 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
        }
        return ret;
 }
+
+int cap_mmap_file(struct file *file, unsigned long reqprot,
+                 unsigned long prot, unsigned long flags)
+{
+       return 0;
+}
index 5497a57fba0154a24b1b87835930e6cc685f855b..3efc9b12aef44016201b02eeafcc10140f17a240 100644 (file)
@@ -20,6 +20,9 @@
 #include <linux/ima.h>
 #include <linux/evm.h>
 #include <linux/fsnotify.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/personality.h>
 #include <net/flow.h>
 
 #define MAX_LSM_EVM_XATTR      2
@@ -657,18 +660,56 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        return security_ops->file_ioctl(file, cmd, arg);
 }
 
-int security_file_mmap(struct file *file, unsigned long reqprot,
-                       unsigned long prot, unsigned long flags,
-                       unsigned long addr, unsigned long addr_only)
+static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
 {
-       int ret;
+       /*
+        * Do we have PROT_READ and does the application expect
+        * it to imply PROT_EXEC?  If not, nothing to talk about...
+        */
+       if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
+               return prot;
+       if (!(current->personality & READ_IMPLIES_EXEC))
+               return prot;
+       /*
+        * if that's an anonymous mapping, let it.
+        */
+       if (!file)
+               return prot | PROT_EXEC;
+       /*
+        * ditto if it's not on noexec mount, except that on !MMU we need
+        * BDI_CAP_EXEC_MAP (== VM_MAYEXEC) in this case
+        */
+       if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
+#ifndef CONFIG_MMU
+               unsigned long caps = 0;
+               struct address_space *mapping = file->f_mapping;
+               if (mapping && mapping->backing_dev_info)
+                       caps = mapping->backing_dev_info->capabilities;
+               if (!(caps & BDI_CAP_EXEC_MAP))
+                       return prot;
+#endif
+               return prot | PROT_EXEC;
+       }
+       /* anything on noexec mount won't get PROT_EXEC */
+       return prot;
+}
 
-       ret = security_ops->file_mmap(file, reqprot, prot, flags, addr, addr_only);
+int security_mmap_file(struct file *file, unsigned long prot,
+                       unsigned long flags)
+{
+       int ret;
+       ret = security_ops->mmap_file(file, prot,
+                                       mmap_prot(file, prot), flags);
        if (ret)
                return ret;
        return ima_file_mmap(file, prot);
 }
 
+int security_mmap_addr(unsigned long addr)
+{
+       return security_ops->mmap_addr(addr);
+}
+
 int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                            unsigned long prot)
 {
index fa2341b683314b0c5505f905e6712538555300ad..372ec6502aa8752dca83c3c507e2d0ce9cac84d1 100644 (file)
@@ -3083,9 +3083,7 @@ error:
        return rc;
 }
 
-static int selinux_file_mmap(struct file *file, unsigned long reqprot,
-                            unsigned long prot, unsigned long flags,
-                            unsigned long addr, unsigned long addr_only)
+static int selinux_mmap_addr(unsigned long addr)
 {
        int rc = 0;
        u32 sid = current_sid();
@@ -3104,10 +3102,12 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
        }
 
        /* do DAC check on address space usage */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
+       return cap_mmap_addr(addr);
+}
 
+static int selinux_mmap_file(struct file *file, unsigned long reqprot,
+                            unsigned long prot, unsigned long flags)
+{
        if (selinux_checkreqprot)
                prot = reqprot;
 
@@ -5570,7 +5570,8 @@ static struct security_operations selinux_ops = {
        .file_alloc_security =          selinux_file_alloc_security,
        .file_free_security =           selinux_file_free_security,
        .file_ioctl =                   selinux_file_ioctl,
-       .file_mmap =                    selinux_file_mmap,
+       .mmap_file =                    selinux_mmap_file,
+       .mmap_addr =                    selinux_mmap_addr,
        .file_mprotect =                selinux_file_mprotect,
        .file_lock =                    selinux_file_lock,
        .file_fcntl =                   selinux_file_fcntl,
index 4e93f9ef970b25a78bca26ab2a49962024b3cb50..3ad2902512888282e299b64434a7790d9788e060 100644 (file)
@@ -1259,12 +1259,8 @@ static int sel_make_bools(void)
                if (!inode)
                        goto out;
 
-               ret = -EINVAL;
-               len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
-               if (len < 0)
-                       goto out;
-
                ret = -ENAMETOOLONG;
+               len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
                if (len >= PAGE_SIZE)
                        goto out;
 
@@ -1557,19 +1553,10 @@ static inline u32 sel_ino_to_perm(unsigned long ino)
 static ssize_t sel_read_class(struct file *file, char __user *buf,
                                size_t count, loff_t *ppos)
 {
-       ssize_t rc, len;
-       char *page;
        unsigned long ino = file->f_path.dentry->d_inode->i_ino;
-
-       page = (char *)__get_free_page(GFP_KERNEL);
-       if (!page)
-               return -ENOMEM;
-
-       len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_class(ino));
-       rc = simple_read_from_buffer(buf, count, ppos, page, len);
-       free_page((unsigned long)page);
-
-       return rc;
+       char res[TMPBUFLEN];
+       ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_class(ino));
+       return simple_read_from_buffer(buf, count, ppos, res, len);
 }
 
 static const struct file_operations sel_class_ops = {
@@ -1580,19 +1567,10 @@ static const struct file_operations sel_class_ops = {
 static ssize_t sel_read_perm(struct file *file, char __user *buf,
                                size_t count, loff_t *ppos)
 {
-       ssize_t rc, len;
-       char *page;
        unsigned long ino = file->f_path.dentry->d_inode->i_ino;
-
-       page = (char *)__get_free_page(GFP_KERNEL);
-       if (!page)
-               return -ENOMEM;
-
-       len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_perm(ino));
-       rc = simple_read_from_buffer(buf, count, ppos, page, len);
-       free_page((unsigned long)page);
-
-       return rc;
+       char res[TMPBUFLEN];
+       ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_perm(ino));
+       return simple_read_from_buffer(buf, count, ppos, res, len);
 }
 
 static const struct file_operations sel_perm_ops = {
index d583c054580889eff6f4e9080110aad7ae0370d1..ee0bb5735f35c98d6edfa7bb4c9590ef2a234cc4 100644 (file)
@@ -1171,7 +1171,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 }
 
 /**
- * smack_file_mmap :
+ * smack_mmap_file :
  * Check permissions for a mmap operation.  The @file may be NULL, e.g.
  * if mapping anonymous memory.
  * @file contains the file structure for file to map (may be NULL).
@@ -1180,10 +1180,9 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
  * @flags contains the operational flags.
  * Return 0 if permission is granted.
  */
-static int smack_file_mmap(struct file *file,
+static int smack_mmap_file(struct file *file,
                           unsigned long reqprot, unsigned long prot,
-                          unsigned long flags, unsigned long addr,
-                          unsigned long addr_only)
+                          unsigned long flags)
 {
        struct smack_known *skp;
        struct smack_rule *srp;
@@ -1198,11 +1197,6 @@ static int smack_file_mmap(struct file *file,
        int tmay;
        int rc;
 
-       /* do DAC check on address space usage */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
-
        if (file == NULL || file->f_dentry == NULL)
                return 0;
 
@@ -3482,7 +3476,8 @@ struct security_operations smack_ops = {
        .file_ioctl =                   smack_file_ioctl,
        .file_lock =                    smack_file_lock,
        .file_fcntl =                   smack_file_fcntl,
-       .file_mmap =                    smack_file_mmap,
+       .mmap_file =                    smack_mmap_file,
+       .mmap_addr =                    cap_mmap_addr,
        .file_set_fowner =              smack_file_set_fowner,
        .file_send_sigiotask =          smack_file_send_sigiotask,
        .file_receive =                 smack_file_receive,