Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/signal

author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index d449e632e6a09fca5ca0fedd6bdc2cf87eb44a96..8e2da1e06e3b2371eb82ef07105e63ad97d224b6 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -61,6 +61,7 @@ ata *);
         ssize_t (*listxattr) (struct dentry *, char *, size_t);
         int (*removexattr) (struct dentry *, const char *);
         int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+       void (*update_time)(struct inode *, struct timespec *, int);
  
  locking rules:
         all may block
@@ -87,6 +88,8 @@ getxattr:     no
  listxattr:     no
  removexattr:   yes
  fiemap:                no
+update_time:   no
+
         Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
  victim.
         cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt

index ef19f91a0f12021f5c003f6084711e6b3b4897cc..efd23f4817044ac9d55932bd9476d309a02918dc 100644 (file)
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -363,6 +363,7 @@ struct inode_operations {
         ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
         ssize_t (*listxattr) (struct dentry *, char *, size_t);
         int (*removexattr) (struct dentry *, const char *);
+       void (*update_time)(struct inode *, struct timespec *, int);
  };
  
  Again, all methods are called without any locks being held, unless
@@ -471,6 +472,9 @@ otherwise noted.
    removexattr: called by the VFS to remove an extended attribute from
         a file. This method is called by removexattr(2) system call.
  
+  update_time: called by the VFS to update a specific time or the i_version of
+       an inode.  If this is not defined the VFS will update the inode itself
+       and call mark_inode_dirty_sync.
  
  The Address Space Object
  ========================
diff --git a/arch/alpha/include/asm/posix_types.h b/arch/alpha/include/asm/posix_types.h

index 24779fc95994efb5c4d69e4d507f3cba581570a4..5a8a48320efe9f5c577f9cc8363a4de2a6725559 100644 (file)
--- a/arch/alpha/include/asm/posix_types.h
+++ b/arch/alpha/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned int   __kernel_ino_t;
  #define __kernel_ino_t __kernel_ino_t
  
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
  
  #include <asm-generic/posix_types.h>
diff --git a/arch/arm/include/asm/posix_types.h b/arch/arm/include/asm/posix_types.h

index efdf99045d879e240b9bc41f1f0781efea6ac10f..d2de9cbbcd9bcaf6a9e5b76eefac1f8c8eb7b39d 100644 (file)
--- a/arch/arm/include/asm/posix_types.h
+++ b/arch/arm/include/asm/posix_types.h
@@ -22,9 +22,6 @@
  typedef unsigned short         __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short         __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/avr32/include/asm/posix_types.h b/arch/avr32/include/asm/posix_types.h

index 74667bfc88cc7676e4c43332b40705908c747021..9ba9e749b3f34d7c2760d1d9784ff9ede9528ef3 100644 (file)
--- a/arch/avr32/include/asm/posix_types.h
+++ b/arch/avr32/include/asm/posix_types.h
@@ -17,9 +17,6 @@
  typedef unsigned short  __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short  __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/blackfin/include/asm/posix_types.h b/arch/blackfin/include/asm/posix_types.h

index 41bc1875c4d7fd367bbbea5432b9332a2821557f..1bd3436db6a7b7d080bdf4bcb1a09db621fde1b0 100644 (file)
--- a/arch/blackfin/include/asm/posix_types.h
+++ b/arch/blackfin/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned int __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/cris/include/asm/posix_types.h b/arch/cris/include/asm/posix_types.h

index 234891c74e2bbe21d0aed4301a7552200829c101..ce4e517931514fb0f6d52a2ef1af35b39aa19ea0 100644 (file)
--- a/arch/cris/include/asm/posix_types.h
+++ b/arch/cris/include/asm/posix_types.h
@@ -15,9 +15,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short  __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/frv/include/asm/posix_types.h b/arch/frv/include/asm/posix_types.h

index 3f34cb45fbb3fafd24edff901ca8b52d9941532d..fe512af74a5afbb57dbc1490214155cdbbeb220d 100644 (file)
--- a/arch/frv/include/asm/posix_types.h
+++ b/arch/frv/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/h8300/include/asm/posix_types.h b/arch/h8300/include/asm/posix_types.h

index bc4c34efb1ad167ccafa90ee8796d605058572a0..91e62ba4c7b02e99cf73893ef04e6a8cf259025a 100644 (file)
--- a/arch/h8300/include/asm/posix_types.h
+++ b/arch/h8300/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/ia64/include/asm/posix_types.h b/arch/ia64/include/asm/posix_types.h

index 7323ab9467ebae726473512588d2c8f41d0ceb40..99ee1d6510cfc98a7dc66128fca841af022b133f 100644 (file)
--- a/arch/ia64/include/asm/posix_types.h
+++ b/arch/ia64/include/asm/posix_types.h
@@ -1,9 +1,6 @@
  #ifndef _ASM_IA64_POSIX_TYPES_H
  #define _ASM_IA64_POSIX_TYPES_H
  
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned long  __kernel_sigset_t;      /* at least 32 bits */
  
  #include <asm-generic/posix_types.h>
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c

index f00ba025375d5696d0070bfe640b6f26f554eebd..d7f558c1e7117bfff75a056d4fee9213c6a4b7fb 100644 (file)
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -604,12 +604,6 @@ pfm_unprotect_ctx_ctxsw(pfm_context_t *x, unsigned long f)
         spin_unlock(&(x)->ctx_lock);
  }
  
-static inline unsigned long 
-pfm_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, unsigned long exec)
-{
-       return get_unmapped_area(file, addr, len, pgoff, flags);
-}
-
  /* forward declaration */
  static const struct dentry_operations pfmfs_dentry_operations;
  
@@ -2333,8 +2327,8 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t
         down_write(&task->mm->mmap_sem);
  
         /* find some free area in address space, must have mmap sem held */
-       vma->vm_start = pfm_get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS, 0);
-       if (vma->vm_start == 0UL) {
+       vma->vm_start = get_unmapped_area(NULL, 0, size, 0, MAP_PRIVATE|MAP_ANONYMOUS);
+       if (IS_ERR_VALUE(vma->vm_start)) {
                 DPRINT(("Cannot find unmapped area for size %ld\n", size));
                 up_write(&task->mm->mmap_sem);
                 goto error;
diff --git a/arch/ia64/kernel/sys_ia64.c b/arch/ia64/kernel/sys_ia64.c

index 609d50056a6c7bd9fba2d46b757960b3e8893388..d9439ef2f66187d9e864f91778b8a9022c41577c 100644 (file)
--- a/arch/ia64/kernel/sys_ia64.c
+++ b/arch/ia64/kernel/sys_ia64.c
@@ -171,22 +171,9 @@ asmlinkage unsigned long
  ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags,
              unsigned long new_addr)
  {
-       extern unsigned long do_mremap (unsigned long addr,
-                                       unsigned long old_len,
-                                       unsigned long new_len,
-                                       unsigned long flags,
-                                       unsigned long new_addr);
-
-       down_write(&current->mm->mmap_sem);
-       {
-               addr = do_mremap(addr, old_len, new_len, flags, new_addr);
-       }
-       up_write(&current->mm->mmap_sem);
-
-       if (IS_ERR((void *) addr))
-               return addr;
-
-       force_successful_syscall_return();
+       addr = sys_mremap(addr, old_len, new_len, flags, new_addr);
+       if (!IS_ERR((void *) addr))
+               force_successful_syscall_return();
         return addr;
  }
  
diff --git a/arch/m32r/include/asm/posix_types.h b/arch/m32r/include/asm/posix_types.h

index 0195850e1f88698b7a6c29ffd8b807b9e38a5b53..236de26a409b3f9a3d85df67129d88a488d025d1 100644 (file)
--- a/arch/m32r/include/asm/posix_types.h
+++ b/arch/m32r/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/m68k/include/asm/posix_types.h b/arch/m68k/include/asm/posix_types.h

index 6373093be72bb049f37f071468c3b6c73d6daafe..cf4dbf70fdc73f116f95a83c698511fd7b5f4a62 100644 (file)
--- a/arch/m68k/include/asm/posix_types.h
+++ b/arch/m68k/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/mips/include/asm/posix_types.h b/arch/mips/include/asm/posix_types.h

index e0308dcca1358f6f2db6161486299de11d61dde5..fa03ec3fbf897a4c3271d8a38025f383c82760cc 100644 (file)
--- a/arch/mips/include/asm/posix_types.h
+++ b/arch/mips/include/asm/posix_types.h
@@ -17,11 +17,6 @@
   * assume GCC is being used.
   */
  
-#if (_MIPS_SZLONG == 64)
-typedef unsigned int   __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-#endif
-
  typedef long           __kernel_daddr_t;
  #define __kernel_daddr_t __kernel_daddr_t
  
diff --git a/arch/mips/include/asm/stat.h b/arch/mips/include/asm/stat.h

index 6e00f751ab6dc675b886d736e1f0c814136cfbd4..fe9a4c3ec5a1f2d9f557adf7c32348b0c892e374 100644 (file)
--- a/arch/mips/include/asm/stat.h
+++ b/arch/mips/include/asm/stat.h
@@ -20,7 +20,7 @@ struct stat {
         long            st_pad1[3];             /* Reserved for network id */
         ino_t           st_ino;
         mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
         uid_t           st_uid;
         gid_t           st_gid;
         unsigned        st_rdev;
@@ -55,7 +55,7 @@ struct stat64 {
         unsigned long long      st_ino;
  
         mode_t          st_mode;
-       nlink_t         st_nlink;
+       __u32           st_nlink;
  
         uid_t           st_uid;
         gid_t           st_gid;
@@ -96,7 +96,7 @@ struct stat {
         unsigned long           st_ino;
  
         mode_t                  st_mode;
-       nlink_t                 st_nlink;
+       __u32                   st_nlink;
  
         uid_t                   st_uid;
         gid_t                   st_gid;
diff --git a/arch/mn10300/include/asm/posix_types.h b/arch/mn10300/include/asm/posix_types.h

index ab506181ec3108ad98c2db77d73f01e2e2b9b134..d31eeea480cfdda8a4231351abbf61c76da08a5f 100644 (file)
--- a/arch/mn10300/include/asm/posix_types.h
+++ b/arch/mn10300/include/asm/posix_types.h
@@ -20,9 +20,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/parisc/include/asm/posix_types.h b/arch/parisc/include/asm/posix_types.h

index 5212b0357daf15aaf454b751b0eb146fc47b34ac..b9344256f76b365db1c6a9a99b08b337f706970c 100644 (file)
--- a/arch/parisc/include/asm/posix_types.h
+++ b/arch/parisc/include/asm/posix_types.h
@@ -10,9 +10,6 @@
  typedef unsigned short         __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short         __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short         __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/arch/parisc/include/asm/stat.h b/arch/parisc/include/asm/stat.h

index 9d5fbbc5c31f14df4791b1005e83ade0cd0505f4..d76fbda5d62c0437f5fb52c389e144597f054b12 100644 (file)
--- a/arch/parisc/include/asm/stat.h
+++ b/arch/parisc/include/asm/stat.h
@@ -7,7 +7,7 @@ struct stat {
         unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
         ino_t           st_ino;         /* 32 bits */
         mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
         unsigned short  st_reserved1;   /* old st_uid */
         unsigned short  st_reserved2;   /* old st_gid */
         unsigned int    st_rdev;
@@ -42,7 +42,7 @@ struct hpux_stat64 {
         unsigned int    st_dev;         /* dev_t is 32 bits on parisc */
         ino_t           st_ino;         /* 32 bits */
         mode_t          st_mode;        /* 16 bits */
-       nlink_t         st_nlink;       /* 16 bits */
+       unsigned short  st_nlink;       /* 16 bits */
         unsigned short  st_reserved1;   /* old st_uid */
         unsigned short  st_reserved2;   /* old st_gid */
         unsigned int    st_rdev;
diff --git a/arch/powerpc/include/asm/posix_types.h b/arch/powerpc/include/asm/posix_types.h

index f1393252bbdad837c97b794c8534328a9c912ee3..2958c5b97b2dd4100ac5907129b4736d94458cf7 100644 (file)
--- a/arch/powerpc/include/asm/posix_types.h
+++ b/arch/powerpc/include/asm/posix_types.h
@@ -16,9 +16,6 @@ typedef int           __kernel_ssize_t;
  typedef long           __kernel_ptrdiff_t;
  #define __kernel_size_t __kernel_size_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef short          __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  #endif
diff --git a/arch/powerpc/include/asm/stat.h b/arch/powerpc/include/asm/stat.h

index e4edc510b530cfed6420e96012a69dc424a34e79..10cfb558e0fd7d1a82df840dd67c3578be1cbe52 100644 (file)
--- a/arch/powerpc/include/asm/stat.h
+++ b/arch/powerpc/include/asm/stat.h
@@ -30,11 +30,11 @@ struct stat {
         unsigned long   st_dev;
         ino_t           st_ino;
  #ifdef __powerpc64__
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
         mode_t          st_mode;
  #else
         mode_t          st_mode;
-       nlink_t         st_nlink;
+       unsigned short  st_nlink;
  #endif
         uid_t           st_uid;
         gid_t           st_gid;
diff --git a/arch/s390/include/asm/posix_types.h b/arch/s390/include/asm/posix_types.h

index edf8527ff08d9bdf9e5f0468e24faebfdd372c34..7be104c0f19230e157d569efb9a18002dfa4709d 100644 (file)
--- a/arch/s390/include/asm/posix_types.h
+++ b/arch/s390/include/asm/posix_types.h
@@ -24,7 +24,6 @@ typedef unsigned short        __kernel_old_dev_t;
  
  typedef unsigned long   __kernel_ino_t;
  typedef unsigned short  __kernel_mode_t;
-typedef unsigned short  __kernel_nlink_t;
  typedef unsigned short  __kernel_ipc_pid_t;
  typedef unsigned short  __kernel_uid_t;
  typedef unsigned short  __kernel_gid_t;
@@ -35,7 +34,6 @@ typedef int             __kernel_ptrdiff_t;
  
  typedef unsigned int    __kernel_ino_t;
  typedef unsigned int    __kernel_mode_t;
-typedef unsigned int    __kernel_nlink_t;
  typedef int             __kernel_ipc_pid_t;
  typedef unsigned int    __kernel_uid_t;
  typedef unsigned int    __kernel_gid_t;
@@ -47,7 +45,6 @@ typedef unsigned long   __kernel_sigset_t;      /* at least 32 bits */
  
  #define __kernel_ino_t  __kernel_ino_t
  #define __kernel_mode_t __kernel_mode_t
-#define __kernel_nlink_t __kernel_nlink_t
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  #define __kernel_uid_t __kernel_uid_t
  #define __kernel_gid_t __kernel_gid_t
diff --git a/arch/sh/include/asm/posix_types_32.h b/arch/sh/include/asm/posix_types_32.h

index abda58467ece9e86ff1249029bcc7143281b2f04..ba0bdc423b072fa62f74fbc64e1bc5b683f2af7d 100644 (file)
--- a/arch/sh/include/asm/posix_types_32.h
+++ b/arch/sh/include/asm/posix_types_32.h
@@ -3,8 +3,6 @@
  
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  typedef unsigned short __kernel_uid_t;
diff --git a/arch/sh/include/asm/posix_types_64.h b/arch/sh/include/asm/posix_types_64.h

index fcda07b4a616be8196f105ce5d2faee8682c9af1..244f7e950e176b0cbdc907f70b4fdf88572b08f0 100644 (file)
--- a/arch/sh/include/asm/posix_types_64.h
+++ b/arch/sh/include/asm/posix_types_64.h
@@ -3,8 +3,6 @@
  
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  typedef unsigned short __kernel_uid_t;
diff --git a/arch/sparc/include/asm/posix_types.h b/arch/sparc/include/asm/posix_types.h

index 3070f25ae90a3e235eaaf2373949226ea83acfb6..156220ed99eb7dfbfe8696ea9f04da84fc00ff5f 100644 (file)
--- a/arch/sparc/include/asm/posix_types.h
+++ b/arch/sparc/include/asm/posix_types.h
@@ -9,8 +9,6 @@
  
  #if defined(__sparc__) && defined(__arch64__)
  /* sparc 64 bit */
-typedef unsigned int           __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
  
  typedef unsigned short                __kernel_old_uid_t;
  typedef unsigned short         __kernel_old_gid_t;
@@ -38,9 +36,6 @@ typedef unsigned short         __kernel_gid_t;
  typedef unsigned short         __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef short                  __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef long                   __kernel_daddr_t;
  #define __kernel_daddr_t __kernel_daddr_t
  
diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c

index 3ee51f189a55297b0babeb1f54d0b40af97de6f8..275f74fd6f6a3f16fdd4e5fae291af2a364075c0 100644 (file)
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -580,16 +580,9 @@ SYSCALL_DEFINE5(64_mremap, unsigned long, addr,    unsigned long, old_len,
                 unsigned long, new_len, unsigned long, flags,
                 unsigned long, new_addr)
  {
-       unsigned long ret = -EINVAL;
-
         if (test_thread_flag(TIF_32BIT))
-               goto out;
-
-       down_write(&current->mm->mmap_sem);
-       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
-       up_write(&current->mm->mmap_sem);
-out:
-       return ret;       
+               return -EINVAL;
+       return sys_mremap(addr, old_len, new_len, flags, new_addr);
  }
  
  /* we come to here via sys_nis_syscall so it can setup the regs argument */
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h

index 69adc08d36a52541b1be754dd7c809824b2d707a..6e74450ff0a110afc32901e273d74a77c80f8ca4 100644 (file)
--- a/arch/tile/include/asm/compat.h
+++ b/arch/tile/include/asm/compat.h
@@ -44,7 +44,6 @@ typedef __kernel_uid32_t __compat_gid32_t;
  typedef __kernel_mode_t compat_mode_t;
  typedef __kernel_dev_t compat_dev_t;
  typedef __kernel_loff_t compat_loff_t;
-typedef __kernel_nlink_t compat_nlink_t;
  typedef __kernel_ipc_pid_t compat_ipc_pid_t;
  typedef __kernel_daddr_t compat_daddr_t;
  typedef __kernel_fsid_t        compat_fsid_t;
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h

index 99f262e04b91b6d553fd65bd61957bd9cb5cbd15..8e525059e7d81c0a4cd46dfa2f62695daba80fee 100644 (file)
--- a/arch/x86/include/asm/posix_types_32.h
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -10,9 +10,6 @@
  typedef unsigned short __kernel_mode_t;
  #define __kernel_mode_t __kernel_mode_t
  
-typedef unsigned short __kernel_nlink_t;
-#define __kernel_nlink_t __kernel_nlink_t
-
  typedef unsigned short __kernel_ipc_pid_t;
  #define __kernel_ipc_pid_t __kernel_ipc_pid_t
  
diff --git a/drivers/base/soc.c b/drivers/base/soc.c

index ba29b2e73d48936ab9a93a028abd0b0d9cd29691..72b5e7280d14792e6d83f3a59e4d4793d20fe28c 100644 (file)
--- a/drivers/base/soc.c
+++ b/drivers/base/soc.c
@@ -42,7 +42,7 @@ struct device *soc_device_to_device(struct soc_device *soc_dev)
         return &soc_dev->dev;
  }
  
-static mode_t soc_attribute_mode(struct kobject *kobj,
+static umode_t soc_attribute_mode(struct kobject *kobj,
                                   struct attribute *attr,
                                   int index)
  {
diff --git a/drivers/gpu/drm/i810/i810_dma.c b/drivers/gpu/drm/i810/i810_dma.c

index f920fb5e42b63846e3d8b7b782b492e547e18eef..fa9439159ebd6bc85cdf4e27a307d9cde12dcbd6 100644 (file)
--- a/drivers/gpu/drm/i810/i810_dma.c
+++ b/drivers/gpu/drm/i810/i810_dma.c
@@ -130,11 +130,10 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                 return -EINVAL;
  
         /* This is all entirely broken */
-       down_write(&current->mm->mmap_sem);
         old_fops = file_priv->filp->f_op;
         file_priv->filp->f_op = &i810_buffer_fops;
         dev_priv->mmap_buffer = buf;
-       buf_priv->virtual = (void *)do_mmap(file_priv->filp, 0, buf->total,
+       buf_priv->virtual = (void *)vm_mmap(file_priv->filp, 0, buf->total,
                                             PROT_READ | PROT_WRITE,
                                             MAP_SHARED, buf->bus_address);
         dev_priv->mmap_buffer = NULL;
@@ -145,7 +144,6 @@ static int i810_map_buffer(struct drm_buf *buf, struct drm_file *file_priv)
                 retcode = PTR_ERR(buf_priv->virtual);
                 buf_priv->virtual = NULL;
         }
-       up_write(&current->mm->mmap_sem);
  
         return retcode;
  }
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c

index a1e6c990cd410efded55c826f03bc5db13839d75..e3dd2a1e2bfc18e47abae82bce7ee60238527c08 100644 (file)
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -68,24 +68,6 @@ static gid_t v9fs_get_fsgid_for_create(struct inode *dir_inode)
         return current_fsgid();
  }
  
-/**
- * v9fs_dentry_from_dir_inode - helper function to get the dentry from
- * dir inode.
- *
- */
-
-static struct dentry *v9fs_dentry_from_dir_inode(struct inode *inode)
-{
-       struct dentry *dentry;
-
-       spin_lock(&inode->i_lock);
-       /* Directory should have only one entry. */
-       BUG_ON(S_ISDIR(inode->i_mode) && !list_is_singular(&inode->i_dentry));
-       dentry = list_entry(inode->i_dentry.next, struct dentry, d_alias);
-       spin_unlock(&inode->i_lock);
-       return dentry;
-}
-
  static int v9fs_test_inode_dotl(struct inode *inode, void *data)
  {
         struct v9fs_inode *v9inode = V9FS_I(inode);
@@ -415,7 +397,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir,
         if (dir->i_mode & S_ISGID)
                 omode |= S_ISGID;
  
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
         dfid = v9fs_fid_lookup(dir_dentry);
         if (IS_ERR(dfid)) {
                 err = PTR_ERR(dfid);
@@ -793,7 +775,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir,
                  dir->i_ino, old_dentry->d_name.name, dentry->d_name.name);
  
         v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
         dfid = v9fs_fid_lookup(dir_dentry);
         if (IS_ERR(dfid))
                 return PTR_ERR(dfid);
@@ -858,7 +840,7 @@ v9fs_vfs_mknod_dotl(struct inode *dir, struct dentry *dentry, umode_t omode,
                 return -EINVAL;
  
         v9ses = v9fs_inode2v9ses(dir);
-       dir_dentry = v9fs_dentry_from_dir_inode(dir);
+       dir_dentry = dentry->d_parent;
         dfid = v9fs_fid_lookup(dir_dentry);
         if (IS_ERR(dfid)) {
                 err = PTR_ERR(dfid);
diff --git a/fs/affs/affs.h b/fs/affs/affs.h

index 45a0ce45d7b46afa94b1290511bc1f91a9872137..1fceb320d2f22c16bc1a900cb27597d68977dbbd 100644 (file)
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -18,14 +18,6 @@
  #define AFFS_GET_HASHENTRY(data,hashkey) be32_to_cpu(((struct dir_front *)data)->hashtable[hashkey])
  #define AFFS_BLOCK(sb, bh, blk)                (AFFS_HEAD(bh)->table[AFFS_SB(sb)->s_hashsize-1-(blk)])
  
-#ifdef __LITTLE_ENDIAN
-#define BO_EXBITS      0x18UL
-#elif defined(__BIG_ENDIAN)
-#define BO_EXBITS      0x00UL
-#else
-#error Endianness must be known for affs to work.
-#endif
-
  #define AFFS_HEAD(bh)          ((struct affs_head *)(bh)->b_data)
  #define AFFS_TAIL(sb, bh)      ((struct affs_tail *)((bh)->b_data+(sb)->s_blocksize-sizeof(struct affs_tail)))
  #define AFFS_ROOT_HEAD(bh)     ((struct affs_root_head *)(bh)->b_data)
diff --git a/fs/aio.c b/fs/aio.c

index 8c7c8b805372094cc5072b7cd49b5e771ab507c4..55c4c76560537f7fe72d6ff5f429eff666b86789 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -134,9 +134,9 @@ static int aio_setup_ring(struct kioctx *ctx)
         info->mmap_size = nr_pages * PAGE_SIZE;
         dprintk("attempting mmap of %lu bytes\n", info->mmap_size);
         down_write(&ctx->mm->mmap_sem);
-       info->mmap_base = do_mmap(NULL, 0, info->mmap_size, 
-                                 PROT_READ|PROT_WRITE, MAP_ANONYMOUS|MAP_PRIVATE,
-                                 0);
+       info->mmap_base = do_mmap_pgoff(NULL, 0, info->mmap_size, 
+                                       PROT_READ|PROT_WRITE,
+                                       MAP_ANONYMOUS|MAP_PRIVATE, 0);
         if (IS_ERR((void *)info->mmap_base)) {
                 up_write(&ctx->mm->mmap_sem);
                 info->mmap_size = 0;
diff --git a/fs/attr.c b/fs/attr.c

index 584620e5dee52b5be4a456fb0572a5227a0ef534..0da90951d2776f827a905337938399ada79e8e69 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -176,6 +176,11 @@ int notify_change(struct dentry * dentry, struct iattr * attr)
                         return -EPERM;
         }
  
+       if ((ia_valid & ATTR_SIZE) && IS_I_VERSION(inode)) {
+               if (attr->ia_size != inode->i_size)
+                       inode_inc_iversion(inode);
+       }
+
         if ((ia_valid & ATTR_MODE)) {
                 umode_t amode = attr->ia_mode;
                 /* Flag setting protected by i_mutex */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c

index e658dd134b95fb375b371a931e739baa95d249a8..1b52956afe33ab07889c3963ce2c41b32133483b 100644 (file)
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -329,7 +329,6 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
         if (!size)
                 return addr;
  
-       down_write(&current->mm->mmap_sem);
         /*
         * total_size is the size of the ELF (interpreter) image.
         * The _first_ mmap needs to know the full size, otherwise
@@ -340,13 +339,12 @@ static unsigned long elf_map(struct file *filep, unsigned long addr,
         */
         if (total_size) {
                 total_size = ELF_PAGEALIGN(total_size);
-               map_addr = do_mmap(filep, addr, total_size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, total_size, prot, type, off);
                 if (!BAD_ADDR(map_addr))
-                       do_munmap(current->mm, map_addr+size, total_size-size);
+                       vm_munmap(map_addr+size, total_size-size);
         } else
-               map_addr = do_mmap(filep, addr, size, prot, type, off);
+               map_addr = vm_mmap(filep, addr, size, prot, type, off);
  
-       up_write(&current->mm->mmap_sem);
         return(map_addr);
  }
  
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c

index 6b2daf99fab8bcd91d314f0abd951b8472a092d2..178cb70acc26de80ec3db21a8455e88b7fc0360b 100644 (file)
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -562,7 +562,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                                 realdatastart = (unsigned long) -ENOMEM;
                         printk("Unable to allocate RAM for process data, errno %d\n",
                                         (int)-realdatastart);
-                       do_munmap(current->mm, textpos, text_len);
+                       vm_munmap(textpos, text_len);
                         ret = realdatastart;
                         goto err;
                 }
@@ -586,8 +586,8 @@ static int load_flat_file(struct linux_binprm * bprm,
                 }
                 if (IS_ERR_VALUE(result)) {
                         printk("Unable to read data+bss, errno %d\n", (int)-result);
-                       do_munmap(current->mm, textpos, text_len);
-                       do_munmap(current->mm, realdatastart, len);
+                       vm_munmap(textpos, text_len);
+                       vm_munmap(realdatastart, len);
                         ret = result;
                         goto err;
                 }
@@ -654,7 +654,7 @@ static int load_flat_file(struct linux_binprm * bprm,
                 }
                 if (IS_ERR_VALUE(result)) {
                         printk("Unable to read code+data+bss, errno %d\n",(int)-result);
-                       do_munmap(current->mm, textpos, text_len + data_len + extra +
+                       vm_munmap(textpos, text_len + data_len + extra +
                                 MAX_SHARED_LIBS * sizeof(unsigned long));
                         ret = result;
                         goto err;
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c

index 89b156d85d63c9f29b66413e1558e85a758d0e12..761e2cd8fed16e6046951e50504b8bb9e7acd3e4 100644 (file)
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -227,7 +227,11 @@ int btrfs_init_acl(struct btrfs_trans_handle *trans,
                 if (ret > 0) {
                         /* we need an acl */
                         ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+               } else {
+                       cache_no_acl(inode);
                 }
+       } else {
+               cache_no_acl(inode);
         }
  failed:
         posix_acl_release(acl);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c

index bcec06750232e6cc3de09c62648201547709222b..3f75895c919bcc3b80ae63ab1fca42dcf335f95b 100644 (file)
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -24,22 +24,135 @@
  #include "delayed-ref.h"
  #include "locking.h"
  
+struct extent_inode_elem { /* list node: one (inum, offset) pair that refers to an extent */
+       u64 inum; /* objectid of the referencing key (set in check_extent_in_eb) */
+       u64 offset; /* key offset + (extent_item_pos - data_offset) */
+       struct extent_inode_elem *next; /* next node; NULL terminates the list */
+};
+
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+                               struct btrfs_file_extent_item *fi,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie) /* in/out: head of the result list */
+{ /* prepend a new element to *eie if the range described by fi covers extent_item_pos */
+       u64 data_offset;
+       u64 data_len;
+       struct extent_inode_elem *e;
+
+       data_offset = btrfs_file_extent_offset(eb, fi);
+       data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+       if (extent_item_pos < data_offset ||
+           extent_item_pos >= data_offset + data_len)
+               return 1; /* outside [data_offset, data_offset + data_len): nothing added */
+
+       e = kmalloc(sizeof(*e), GFP_NOFS);
+       if (!e)
+               return -ENOMEM; /* *eie is left untouched */
+
+       e->next = *eie; /* push onto the front of the caller's list */
+       e->inum = key->objectid;
+       e->offset = key->offset + (extent_item_pos - data_offset);
+       *eie = e;
+
+       return 0; /* one element was added */
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+                               u64 extent_item_pos,
+                               struct extent_inode_elem **eie) /* in/out: head of the result list */
+{ /* walk every item in eb and collect on *eie those referring to wanted_disk_byte at extent_item_pos */
+       u64 disk_byte;
+       struct btrfs_key key;
+       struct btrfs_file_extent_item *fi;
+       int slot;
+       int nritems;
+       int extent_type;
+       int ret;
+
+       /*
+        * from the shared data ref, we only have the leaf but we need
+        * the key. thus, we must look into all items and see that we
+        * find one (some) with a reference to our extent item.
+        */
+       nritems = btrfs_header_nritems(eb);
+       for (slot = 0; slot < nritems; ++slot) {
+               btrfs_item_key_to_cpu(eb, &key, slot);
+               if (key.type != BTRFS_EXTENT_DATA_KEY)
+                       continue; /* not a file extent item */
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               extent_type = btrfs_file_extent_type(eb, fi);
+               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+                       continue; /* inline extents are skipped */
+               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+               if (disk_byte != wanted_disk_byte)
+                       continue; /* item points at a different extent */
+
+               ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+               if (ret < 0)
+                       return ret; /* only errors abort; a result of 1 (position not covered) is ignored */
+       }
+
+       return 0; /* scan finished, whether or not anything was added to *eie */
+}
+
  /*
   * this structure records all encountered refs on the way up to the root
   */
  struct __prelim_ref {
         struct list_head list;
         u64 root_id;
-       struct btrfs_key key;
+       struct btrfs_key key_for_search; /* zeroed by __add_prelim_ref() when no key is passed */
         int level;
         int count;
+       struct extent_inode_elem *inode_list; /* set to NULL by __add_prelim_ref() */
         u64 parent;
         u64 wanted_disk_byte;
  };
  
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
  static int __add_prelim_ref(struct list_head *head, u64 root_id,
-                           struct btrfs_key *key, int level, u64 parent,
-                           u64 wanted_disk_byte, int count)
+                           struct btrfs_key *key, int level,
+                           u64 parent, u64 wanted_disk_byte, int count)
  {
         struct __prelim_ref *ref;
  
@@ -50,10 +163,11 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
  
         ref->root_id = root_id;
         if (key)
-               ref->key = *key;
+               ref->key_for_search = *key;
         else
-               memset(&ref->key, 0, sizeof(ref->key));
+               memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
  
+       ref->inode_list = NULL;
         ref->level = level;
         ref->count = count;
         ref->parent = parent;
@@ -64,18 +178,26 @@ static int __add_prelim_ref(struct list_head *head, u64 root_id,
  }
  
  static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
-                               struct ulist *parents,
-                               struct extent_buffer *eb, int level,
-                               u64 wanted_objectid, u64 wanted_disk_byte)
+                               struct ulist *parents, int level,
+                               struct btrfs_key *key, u64 wanted_disk_byte,
+                               const u64 *extent_item_pos)
  {
         int ret;
-       int slot;
+       int slot = path->slots[level];
+       struct extent_buffer *eb = path->nodes[level];
         struct btrfs_file_extent_item *fi;
-       struct btrfs_key key;
+       struct extent_inode_elem *eie = NULL;
         u64 disk_byte;
+       u64 wanted_objectid = key->objectid;
  
  add_parent:
-       ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+       if (level == 0 && extent_item_pos) {
+               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+               ret = check_extent_in_eb(key, eb, fi, *extent_item_pos, &eie);
+               if (ret < 0)
+                       return ret;
+       }
+       ret = ulist_add(parents, eb->start, (unsigned long)eie, GFP_NOFS);
         if (ret < 0)
                 return ret;
  
@@ -89,6 +211,7 @@ add_parent:
          * repeat this until we don't find any additional EXTENT_DATA items.
          */
         while (1) {
+               eie = NULL;
                 ret = btrfs_next_leaf(root, path);
                 if (ret < 0)
                         return ret;
@@ -97,9 +220,9 @@ add_parent:
  
                 eb = path->nodes[0];
                 for (slot = 0; slot < btrfs_header_nritems(eb); ++slot) {
-                       btrfs_item_key_to_cpu(eb, &key, slot);
-                       if (key.objectid != wanted_objectid ||
-                           key.type != BTRFS_EXTENT_DATA_KEY)
+                       btrfs_item_key_to_cpu(eb, key, slot);
+                       if (key->objectid != wanted_objectid ||
+                           key->type != BTRFS_EXTENT_DATA_KEY)
                                 return 0;
                         fi = btrfs_item_ptr(eb, slot,
                                                 struct btrfs_file_extent_item);
@@ -118,8 +241,10 @@ add_parent:
   */
  static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                                         int search_commit_root,
+                                       u64 time_seq,
                                         struct __prelim_ref *ref,
-                                       struct ulist *parents)
+                                       struct ulist *parents,
+                                       const u64 *extent_item_pos)
  {
         struct btrfs_path *path;
         struct btrfs_root *root;
@@ -152,12 +277,13 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                 goto out;
  
         path->lowest_level = level;
-       ret = btrfs_search_slot(NULL, root, &ref->key, path, 0, 0);
+       ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
         pr_debug("search slot in root %llu (level %d, ref count %d) returned "
                  "%d for key (%llu %u %llu)\n",
                  (unsigned long long)ref->root_id, level, ref->count, ret,
-                (unsigned long long)ref->key.objectid, ref->key.type,
-                (unsigned long long)ref->key.offset);
+                (unsigned long long)ref->key_for_search.objectid,
+                ref->key_for_search.type,
+                (unsigned long long)ref->key_for_search.offset);
         if (ret < 0)
                 goto out;
  
@@ -179,9 +305,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
                 btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
         }
  
-       /* the last two parameters will only be used for level == 0 */
-       ret = add_all_parents(root, path, parents, eb, level, key.objectid,
-                               ref->wanted_disk_byte);
+       ret = add_all_parents(root, path, parents, level, &key,
+                               ref->wanted_disk_byte, extent_item_pos);
  out:
         btrfs_free_path(path);
         return ret;
@@ -191,8 +316,9 @@ out:
   * resolve all indirect backrefs from the list
   */
  static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
-                                  int search_commit_root,
-                                  struct list_head *head)
+                                  int search_commit_root, u64 time_seq,
+                                  struct list_head *head,
+                                  const u64 *extent_item_pos)
  {
         int err;
         int ret = 0;
@@ -201,6 +327,7 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
         struct __prelim_ref *new_ref;
         struct ulist *parents;
         struct ulist_node *node;
+       struct ulist_iterator uiter;
  
         parents = ulist_alloc(GFP_NOFS);
         if (!parents)
@@ -217,7 +344,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                 if (ref->count == 0)
                         continue;
                 err = __resolve_indirect_ref(fs_info, search_commit_root,
-                                            ref, parents);
+                                            time_seq, ref, parents,
+                                            extent_item_pos);
                 if (err) {
                         if (ret == 0)
                                 ret = err;
@@ -225,11 +353,14 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                 }
  
                 /* we put the first parent into the ref at hand */
-               node = ulist_next(parents, NULL);
+               ULIST_ITER_INIT(&uiter);
+               node = ulist_next(parents, &uiter);
                 ref->parent = node ? node->val : 0;
+               ref->inode_list =
+                       node ? (struct extent_inode_elem *)node->aux : 0;
  
                 /* additional parents require new refs being added here */
-               while ((node = ulist_next(parents, node))) {
+               while ((node = ulist_next(parents, &uiter))) {
                         new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
                         if (!new_ref) {
                                 ret = -ENOMEM;
@@ -237,6 +368,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
                         }
                         memcpy(new_ref, ref, sizeof(*ref));
                         new_ref->parent = node->val;
+                       new_ref->inode_list =
+                                       (struct extent_inode_elem *)node->aux;
                         list_add(&new_ref->list, &ref->list);
                 }
                 ulist_reinit(parents);
@@ -246,10 +379,65 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
         return ret;
  }
  
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+                                    struct __prelim_ref *ref2)
+{ /* returns 1 only if level, root_id, all three key_for_search fields and parent are equal */
+       if (ref1->level != ref2->level)
+               return 0;
+       if (ref1->root_id != ref2->root_id)
+               return 0;
+       if (ref1->key_for_search.type != ref2->key_for_search.type)
+               return 0;
+       if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+               return 0;
+       if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+               return 0;
+       if (ref1->parent != ref2->parent)
+               return 0;
+
+       return 1; /* every compared field matched */
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+                             struct list_head *head)
+{ /* for refs on head with neither parent nor key, copy the first key of their block into key_for_search */
+       struct list_head *pos;
+       struct extent_buffer *eb;
+
+       list_for_each(pos, head) {
+               struct __prelim_ref *ref;
+               ref = list_entry(pos, struct __prelim_ref, list);
+
+               if (ref->parent)
+                       continue; /* parent already known */
+               if (ref->key_for_search.type)
+                       continue; /* a key is already set (type != 0) */
+               BUG_ON(!ref->wanted_disk_byte);
+               eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+                                    fs_info->tree_root->leafsize, 0);
+               BUG_ON(!eb); /* NOTE(review): a NULL eb hits BUG_ON rather than an error return - confirm intended */
+               btrfs_tree_read_lock(eb);
+               if (btrfs_header_level(eb) == 0)
+                       btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0); /* level 0: key of item 0 */
+               else
+                       btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0); /* other levels: node key 0 */
+               btrfs_tree_read_unlock(eb);
+               free_extent_buffer(eb);
+       }
+       return 0; /* no failure is reported through the return value */
+}
+
  /*
   * merge two lists of backrefs and adjust counts accordingly
   *
   * mode = 1: merge identical keys, if key is set
+ *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
   * mode = 2: merge identical parents
   */
  static int __merge_refs(struct list_head *head, int mode)
@@ -263,20 +451,21 @@ static int __merge_refs(struct list_head *head, int mode)
  
                 ref1 = list_entry(pos1, struct __prelim_ref, list);
  
-               if (mode == 1 && ref1->key.type == 0)
-                       continue;
                 for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
                      pos2 = n2, n2 = pos2->next) {
                         struct __prelim_ref *ref2;
+                       struct __prelim_ref *xchg;
  
                         ref2 = list_entry(pos2, struct __prelim_ref, list);
  
                         if (mode == 1) {
-                               if (memcmp(&ref1->key, &ref2->key,
-                                          sizeof(ref1->key)) ||
-                                   ref1->level != ref2->level ||
-                                   ref1->root_id != ref2->root_id)
+                               if (!ref_for_same_block(ref1, ref2))
                                         continue;
+                               if (!ref1->parent && ref2->parent) {
+                                       xchg = ref1;
+                                       ref1 = ref2;
+                                       ref2 = xchg;
+                               }
                                 ref1->count += ref2->count;
                         } else {
                                 if (ref1->parent != ref2->parent)
@@ -296,16 +485,17 @@ static int __merge_refs(struct list_head *head, int mode)
   * smaller or equal that seq to the list
   */
  static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
-                             struct btrfs_key *info_key,
                               struct list_head *prefs)
  {
         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
         struct rb_node *n = &head->node.rb_node;
+       struct btrfs_key key;
+       struct btrfs_key op_key = {0};
         int sgn;
         int ret = 0;
  
         if (extent_op && extent_op->update_key)
-               btrfs_disk_key_to_cpu(info_key, &extent_op->key);
+               btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
  
         while ((n = rb_prev(n))) {
                 struct btrfs_delayed_ref_node *node;
@@ -337,7 +527,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                         struct btrfs_delayed_tree_ref *ref;
  
                         ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, &op_key,
                                                ref->level + 1, 0, node->bytenr,
                                                node->ref_mod * sgn);
                         break;
@@ -346,7 +536,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                         struct btrfs_delayed_tree_ref *ref;
  
                         ref = btrfs_delayed_node_to_tree_ref(node);
-                       ret = __add_prelim_ref(prefs, ref->root, info_key,
+                       ret = __add_prelim_ref(prefs, ref->root, NULL,
                                                ref->level + 1, ref->parent,
                                                node->bytenr,
                                                node->ref_mod * sgn);
@@ -354,8 +544,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                 }
                 case BTRFS_EXTENT_DATA_REF_KEY: {
                         struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
-
                         ref = btrfs_delayed_node_to_data_ref(node);
  
                         key.objectid = ref->objectid;
@@ -368,7 +556,6 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
                 }
                 case BTRFS_SHARED_DATA_REF_KEY: {
                         struct btrfs_delayed_data_ref *ref;
-                       struct btrfs_key key;
  
                         ref = btrfs_delayed_node_to_data_ref(node);
  
@@ -394,8 +581,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
   */
  static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                              struct btrfs_path *path, u64 bytenr,
-                            struct btrfs_key *info_key, int *info_level,
-                            struct list_head *prefs)
+                            int *info_level, struct list_head *prefs)
  {
         int ret = 0;
         int slot;
@@ -411,7 +597,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
          * enumerate all inline refs
          */
         leaf = path->nodes[0];
-       slot = path->slots[0] - 1;
+       slot = path->slots[0];
  
         item_size = btrfs_item_size_nr(leaf, slot);
         BUG_ON(item_size < sizeof(*ei));
@@ -424,12 +610,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  
         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                 struct btrfs_tree_block_info *info;
-               struct btrfs_disk_key disk_key;
  
                 info = (struct btrfs_tree_block_info *)ptr;
                 *info_level = btrfs_tree_block_level(leaf, info);
-               btrfs_tree_block_key(leaf, info, &disk_key);
-               btrfs_disk_key_to_cpu(info_key, &disk_key);
                 ptr += sizeof(struct btrfs_tree_block_info);
                 BUG_ON(ptr > end);
         } else {
@@ -447,7 +630,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
  
                 switch (type) {
                 case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                 *info_level + 1, offset,
                                                 bytenr, 1);
                         break;
@@ -462,8 +645,9 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                         break;
                 }
                 case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, offset, info_key,
-                                              *info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, offset, NULL,
+                                              *info_level + 1, 0,
+                                              bytenr, 1);
                         break;
                 case BTRFS_EXTENT_DATA_REF_KEY: {
                         struct btrfs_extent_data_ref *dref;
@@ -477,8 +661,8 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
                         key.type = BTRFS_EXTENT_DATA_KEY;
                         key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                         root = btrfs_extent_data_ref_root(leaf, dref);
-                       ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr,
-                                               count);
+                       ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+                                              bytenr, count);
                         break;
                 }
                 default:
@@ -496,8 +680,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
   */
  static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                             struct btrfs_path *path, u64 bytenr,
-                           struct btrfs_key *info_key, int info_level,
-                           struct list_head *prefs)
+                           int info_level, struct list_head *prefs)
  {
         struct btrfs_root *extent_root = fs_info->extent_root;
         int ret;
@@ -527,7 +710,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
  
                 switch (key.type) {
                 case BTRFS_SHARED_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, 0, info_key,
+                       ret = __add_prelim_ref(prefs, 0, NULL,
                                                 info_level + 1, key.offset,
                                                 bytenr, 1);
                         break;
@@ -543,8 +726,9 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                         break;
                 }
                 case BTRFS_TREE_BLOCK_REF_KEY:
-                       ret = __add_prelim_ref(prefs, key.offset, info_key,
-                                               info_level + 1, 0, bytenr, 1);
+                       ret = __add_prelim_ref(prefs, key.offset, NULL,
+                                              info_level + 1, 0,
+                                              bytenr, 1);
                         break;
                 case BTRFS_EXTENT_DATA_REF_KEY: {
                         struct btrfs_extent_data_ref *dref;
@@ -560,7 +744,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
                         key.offset = btrfs_extent_data_ref_offset(leaf, dref);
                         root = btrfs_extent_data_ref_root(leaf, dref);
                         ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-                                               bytenr, count);
+                                              bytenr, count);
                         break;
                 }
                 default:
@@ -582,11 +766,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
   */
  static int find_parent_nodes(struct btrfs_trans_handle *trans,
                              struct btrfs_fs_info *fs_info, u64 bytenr,
-                            u64 seq, struct ulist *refs, struct ulist *roots)
+                            u64 delayed_ref_seq, u64 time_seq,
+                            struct ulist *refs, struct ulist *roots,
+                            const u64 *extent_item_pos)
  {
         struct btrfs_key key;
         struct btrfs_path *path;
-       struct btrfs_key info_key = { 0 };
         struct btrfs_delayed_ref_root *delayed_refs = NULL;
         struct btrfs_delayed_ref_head *head;
         int info_level = 0;
@@ -645,7 +830,7 @@ again:
                                 btrfs_put_delayed_ref(&head->node);
                                 goto again;
                         }
-                       ret = __add_delayed_refs(head, seq, &info_key,
+                       ret = __add_delayed_refs(head, delayed_ref_seq,
                                                  &prefs_delayed);
                         if (ret) {
                                 spin_unlock(&delayed_refs->lock);
@@ -659,16 +844,17 @@ again:
                 struct extent_buffer *leaf;
                 int slot;
  
+               path->slots[0]--;
                 leaf = path->nodes[0];
-               slot = path->slots[0] - 1;
+               slot = path->slots[0];
                 btrfs_item_key_to_cpu(leaf, &key, slot);
                 if (key.objectid == bytenr &&
                     key.type == BTRFS_EXTENT_ITEM_KEY) {
                         ret = __add_inline_refs(fs_info, path, bytenr,
-                                               &info_key, &info_level, &prefs);
+                                               &info_level, &prefs);
                         if (ret)
                                 goto out;
-                       ret = __add_keyed_refs(fs_info, path, bytenr, &info_key,
+                       ret = __add_keyed_refs(fs_info, path, bytenr,
                                                info_level, &prefs);
                         if (ret)
                                 goto out;
@@ -676,21 +862,18 @@ again:
         }
         btrfs_release_path(path);
  
-       /*
-        * when adding the delayed refs above, the info_key might not have
-        * been known yet. Go over the list and replace the missing keys
-        */
-       list_for_each_entry(ref, &prefs_delayed, list) {
-               if ((ref->key.offset | ref->key.type | ref->key.objectid) == 0)
-                       memcpy(&ref->key, &info_key, sizeof(ref->key));
-       }
         list_splice_init(&prefs_delayed, &prefs);
  
+       ret = __add_missing_keys(fs_info, &prefs);
+       if (ret)
+               goto out;
+
         ret = __merge_refs(&prefs, 1);
         if (ret)
                 goto out;
  
-       ret = __resolve_indirect_refs(fs_info, search_commit_root, &prefs);
+       ret = __resolve_indirect_refs(fs_info, search_commit_root, time_seq,
+                                     &prefs, extent_item_pos);
         if (ret)
                 goto out;
  
@@ -709,7 +892,33 @@ again:
                         BUG_ON(ret < 0);
                 }
                 if (ref->count && ref->parent) {
-                       ret = ulist_add(refs, ref->parent, 0, GFP_NOFS);
+                       struct extent_inode_elem *eie = NULL;
+                       if (extent_item_pos && !ref->inode_list) {
+                               u32 bsz;
+                               struct extent_buffer *eb;
+                               bsz = btrfs_level_size(fs_info->extent_root,
+                                                       info_level);
+                               eb = read_tree_block(fs_info->extent_root,
+                                                          ref->parent, bsz, 0);
+                               BUG_ON(!eb);
+                               ret = find_extent_in_eb(eb, bytenr,
+                                                       *extent_item_pos, &eie);
+                               ref->inode_list = eie;
+                               free_extent_buffer(eb);
+                       }
+                       ret = ulist_add_merge(refs, ref->parent,
+                                             (unsigned long)ref->inode_list,
+                                             (unsigned long *)&eie, GFP_NOFS);
+                       if (!ret && extent_item_pos) {
+                               /*
+                                * we've recorded that parent, so we must extend
+                                * its inode list here
+                                */
+                               BUG_ON(!eie);
+                               while (eie->next)
+                                       eie = eie->next;
+                               eie->next = ref->inode_list;
+                       }
                         BUG_ON(ret < 0);
                 }
                 kfree(ref);
@@ -734,6 +943,28 @@ out:
         return ret;
  }
  
+static void free_leaf_list(struct ulist *blocks)
+{ /* free the extent_inode_elem chain stored in each node's aux, then free the ulist itself */
+       struct ulist_node *node = NULL;
+       struct extent_inode_elem *eie;
+       struct extent_inode_elem *eie_next;
+       struct ulist_iterator uiter;
+
+       ULIST_ITER_INIT(&uiter);
+       while ((node = ulist_next(blocks, &uiter))) {
+               if (!node->aux)
+                       continue; /* no chain attached to this node */
+               eie = (struct extent_inode_elem *)node->aux;
+               for (; eie; eie = eie_next) {
+                       eie_next = eie->next; /* read next before eie is freed */
+                       kfree(eie);
+               }
+               node->aux = 0; /* drop the pointer to the freed chain */
+       }
+
+       ulist_free(blocks);
+}
+
  /*
   * Finds all leafs with a reference to the specified combination of bytenr and
   * offset. key_list_head will point to a list of corresponding keys (caller must
@@ -744,7 +975,9 @@ out:
   */
  static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                                 struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **leafs)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **leafs,
+                               const u64 *extent_item_pos)
  {
         struct ulist *tmp;
         int ret;
@@ -758,11 +991,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
                 return -ENOMEM;
         }
  
-       ret = find_parent_nodes(trans, fs_info, bytenr, seq, *leafs, tmp);
+       ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                               time_seq, *leafs, tmp, extent_item_pos);
         ulist_free(tmp);
  
         if (ret < 0 && ret != -ENOENT) {
-               ulist_free(*leafs);
+               free_leaf_list(*leafs);
                 return ret;
         }
  
@@ -784,10 +1018,12 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
   */
  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                 struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots)
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots)
  {
         struct ulist *tmp;
         struct ulist_node *node = NULL;
+       struct ulist_iterator uiter;
         int ret;
  
         tmp = ulist_alloc(GFP_NOFS);
@@ -799,15 +1035,16 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                 return -ENOMEM;
         }
  
+       ULIST_ITER_INIT(&uiter);
         while (1) {
-               ret = find_parent_nodes(trans, fs_info, bytenr, seq,
-                                       tmp, *roots);
+               ret = find_parent_nodes(trans, fs_info, bytenr, delayed_ref_seq,
+                                       time_seq, tmp, *roots, NULL);
                 if (ret < 0 && ret != -ENOENT) {
                         ulist_free(tmp);
                         ulist_free(*roots);
                         return ret;
                 }
-               node = ulist_next(tmp, node);
+               node = ulist_next(tmp, &uiter);
                 if (!node)
                         break;
                 bytenr = node->val;
@@ -1093,67 +1330,25 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
         return 0;
  }
  
-static int iterate_leaf_refs(struct btrfs_fs_info *fs_info, u64 logical,
-                               u64 orig_extent_item_objectid,
-                               u64 extent_item_pos, u64 root,
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list, /* call iterate() once per entry of inode_list; returns 0 or the first non-zero callback result */
+                               u64 root, u64 extent_item_objectid, /* root is passed through to iterate(); extent_item_objectid is only used in debug output */
                                 iterate_extent_inodes_t *iterate, void *ctx)
  {
-       u64 disk_byte;
-       struct btrfs_key key;
-       struct btrfs_file_extent_item *fi;
-       struct extent_buffer *eb;
-       int slot;
-       int nritems;
+       struct extent_inode_elem *eie; /* cursor over inode_list */
         int ret = 0;
-       int extent_type;
-       u64 data_offset;
-       u64 data_len;
-
-       eb = read_tree_block(fs_info->tree_root, logical,
-                               fs_info->tree_root->leafsize, 0);
-       if (!eb)
-               return -EIO;
-
-       /*
-        * from the shared data ref, we only have the leaf but we need
-        * the key. thus, we must look into all items and see that we
-        * find one (some) with a reference to our extent item.
-        */
-       nritems = btrfs_header_nritems(eb);
-       for (slot = 0; slot < nritems; ++slot) {
-               btrfs_item_key_to_cpu(eb, &key, slot);
-               if (key.type != BTRFS_EXTENT_DATA_KEY)
-                       continue;
-               fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-               extent_type = btrfs_file_extent_type(eb, fi);
-               if (extent_type == BTRFS_FILE_EXTENT_INLINE)
-                       continue;
-               /* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
-               disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-               if (disk_byte != orig_extent_item_objectid)
-                       continue;
-
-               data_offset = btrfs_file_extent_offset(eb, fi);
-               data_len = btrfs_file_extent_num_bytes(eb, fi);
-
-               if (extent_item_pos < data_offset ||
-                   extent_item_pos >= data_offset + data_len)
-                       continue;
  
+       for (eie = inode_list; eie; eie = eie->next) { /* walk the ->next chain until the NULL terminator */
                 pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
-                               "root %llu\n", orig_extent_item_objectid,
-                               key.objectid, key.offset, root);
-               ret = iterate(key.objectid,
-                               key.offset + (extent_item_pos - data_offset),
-                               root, ctx);
+                        "root %llu\n", extent_item_objectid,
+                        eie->inum, eie->offset, root);
+               ret = iterate(eie->inum, eie->offset, root, ctx); /* hand the entry's inum and offset to the caller's callback */
                 if (ret) {
-                       pr_debug("stopping iteration because ret=%d\n", ret);
+                       pr_debug("stopping iteration for %llu due to ret=%d\n", /* a non-zero callback result ends the walk early */
+                                extent_item_objectid, ret);
                         break;
                 }
         }
  
-       free_extent_buffer(eb);
-
         return ret;
  }
  
@@ -1175,7 +1370,10 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
         struct ulist *roots = NULL;
         struct ulist_node *ref_node = NULL;
         struct ulist_node *root_node = NULL;
-       struct seq_list seq_elem;
+       struct seq_list seq_elem = {};
+       struct seq_list tree_mod_seq_elem = {};
+       struct ulist_iterator ref_uiter;
+       struct ulist_iterator root_uiter;
         struct btrfs_delayed_ref_root *delayed_refs = NULL;
  
         pr_debug("resolving all inodes for extent %llu\n",
@@ -1192,34 +1390,41 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
                 spin_lock(&delayed_refs->lock);
                 btrfs_get_delayed_seq(delayed_refs, &seq_elem);
                 spin_unlock(&delayed_refs->lock);
+               btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
         }
  
         ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
-                                  extent_item_pos, seq_elem.seq,
-                                  &refs);
-
+                                  seq_elem.seq, tree_mod_seq_elem.seq, &refs,
+                                  &extent_item_pos);
         if (ret)
                 goto out;
  
-       while (!ret && (ref_node = ulist_next(refs, ref_node))) {
-               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val, -1,
-                                               seq_elem.seq, &roots);
+       ULIST_ITER_INIT(&ref_uiter);
+       while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+               ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+                                               seq_elem.seq,
+                                               tree_mod_seq_elem.seq, &roots);
                 if (ret)
                         break;
-               while (!ret && (root_node = ulist_next(roots, root_node))) {
-                       pr_debug("root %llu references leaf %llu\n",
-                                       root_node->val, ref_node->val);
-                       ret = iterate_leaf_refs(fs_info, ref_node->val,
-                                               extent_item_objectid,
-                                               extent_item_pos, root_node->val,
-                                               iterate, ctx);
+               ULIST_ITER_INIT(&root_uiter);
+               while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+                       pr_debug("root %llu references leaf %llu, data list "
+                                "%#lx\n", root_node->val, ref_node->val,
+                                ref_node->aux);
+                       ret = iterate_leaf_refs(
+                               (struct extent_inode_elem *)ref_node->aux,
+                               root_node->val, extent_item_objectid,
+                               iterate, ctx);
                 }
+               ulist_free(roots);
+               roots = NULL;
         }
  
-       ulist_free(refs);
+       free_leaf_list(refs);
         ulist_free(roots);
  out:
         if (!search_commit_root) {
+               btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
                 btrfs_put_delayed_seq(delayed_refs, &seq_elem);
                 btrfs_end_transaction(trans, fs_info->extent_root);
         }
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h

index 57ea2e959e4dcfaba89e4ee0b833f5744c3639d3..c18d8ac7b795da487c4a526979954e91cbddf52b 100644 (file)
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -58,7 +58,8 @@ int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
  
  int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
                                 struct btrfs_fs_info *fs_info, u64 bytenr,
-                               u64 num_bytes, u64 seq, struct ulist **roots);
+                               u64 delayed_ref_seq, u64 time_seq,
+                               struct ulist **roots);
  
  struct btrfs_data_container *init_data_container(u32 total_bytes);
  struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h

index 9b9b15fd5204347c5ef2931fb186af679cb0d369..e616f8872e69bb0cf3b9a3f369ba49eeca57f2de 100644 (file)
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -24,6 +24,20 @@
  #include "ordered-data.h"
  #include "delayed-inode.h"
  
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
+#define BTRFS_INODE_ORDERED_DATA_CLOSE         0 /* takes over from the ordered_data_close:1 bitfield; presumably a bit number in runtime_flags - TODO confirm */
+#define BTRFS_INODE_ORPHAN_META_RESERVED       1 /* takes over from the orphan_meta_reserved:1 bitfield */
+#define BTRFS_INODE_DUMMY                      2 /* takes over from the dummy_inode:1 bitfield */
+#define BTRFS_INODE_IN_DEFRAG                  3 /* takes over from the in_defrag:1 bitfield */
+#define BTRFS_INODE_DELALLOC_META_RESERVED     4 /* takes over from the delalloc_meta_reserved:1 bitfield */
+#define BTRFS_INODE_HAS_ORPHAN_ITEM            5 /* no former bitfield; NOTE(review): may supersede the removed i_orphan list - confirm */
+
+
  /* in memory btrfs inode */
  struct btrfs_inode {
         /* which subvolume this inode belongs to */
@@ -57,9 +71,6 @@ struct btrfs_inode {
         /* used to order data wrt metadata */
         struct btrfs_ordered_inode_tree ordered_tree;
  
-       /* for keeping track of orphaned inodes */
-       struct list_head i_orphan;
-
         /* list of all the delalloc inodes in the FS.  There are times we need
          * to write all the delalloc pages to disk, and this list is used
          * to walk them all.
@@ -78,14 +89,13 @@ struct btrfs_inode {
         /* the space_info for where this inode's data allocations are done */
         struct btrfs_space_info *space_info;
  
+       unsigned long runtime_flags;
+
         /* full 64 bit generation number, struct vfs_inode doesn't have a big
          * enough field for this.
          */
         u64 generation;
  
-       /* sequence number for NFS changes */
-       u64 sequence;
-
         /*
          * transid of the trans_handle that last modified this inode
          */
@@ -144,23 +154,10 @@ struct btrfs_inode {
         unsigned outstanding_extents;
         unsigned reserved_extents;
  
-       /*
-        * ordered_data_close is set by truncate when a file that used
-        * to have good data has been truncated to zero.  When it is set
-        * the btrfs file release call will add this inode to the
-        * ordered operations list so that we make sure to flush out any
-        * new data the application may have written before commit.
-        */
-       unsigned ordered_data_close:1;
-       unsigned orphan_meta_reserved:1;
-       unsigned dummy_inode:1;
-       unsigned in_defrag:1;
-       unsigned delalloc_meta_reserved:1;
-
         /*
          * always compress this one file
          */
-       unsigned force_compress:4;
+       unsigned force_compress;
  
         struct btrfs_delayed_node *delayed_node;
  
@@ -202,4 +199,17 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
         return false;
  }
  
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) /* returns 1 iff logged_trans == generation and last_sub_trans <= root->last_log_commit, else 0 */
+{
+       struct btrfs_root *root = BTRFS_I(inode)->root; /* root that owns the log_mutex and last_log_commit used below */
+       int ret = 0; /* default answer: not in the log */
+
+       mutex_lock(&root->log_mutex); /* both comparisons are made while holding root->log_mutex */
+       if (BTRFS_I(inode)->logged_trans == generation &&
+           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
+               ret = 1;
+       mutex_unlock(&root->log_mutex);
+       return ret;
+}
+
  #endif
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c

index c053e90f2006f580ed4f8a4440fb520639c3edd6..9cebb1fd6a3cc59919c7c990d3016caee52b5849 100644 (file)
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -103,8 +103,6 @@
  #define BTRFSIC_BLOCK_STACK_FRAME_MAGIC_NUMBER 20111300
  #define BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL (200 - 6)   /* in characters,
                                                          * excluding " [...]" */
-#define BTRFSIC_BLOCK_SIZE PAGE_SIZE
-
  #define BTRFSIC_GENERATION_UNKNOWN ((u64)-1)
  
  /*
@@ -210,8 +208,9 @@ struct btrfsic_block_data_ctx {
         u64 dev_bytenr;         /* physical bytenr on device */
         u32 len;
         struct btrfsic_dev_state *dev;
-       char *data;
-       struct buffer_head *bh; /* do not use if set to NULL */
+       char **datav;
+       struct page **pagev;
+       void *mem_to_free;
  };
  
  /* This structure is used to implement recursion without occupying
@@ -243,6 +242,8 @@ struct btrfsic_state {
         struct btrfs_root *root;
         u64 max_superblock_generation;
         struct btrfsic_block *latest_superblock;
+       u32 metablock_size;
+       u32 datablock_size;
  };
  
  static void btrfsic_block_init(struct btrfsic_block *b);
@@ -290,8 +291,10 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
  static int btrfsic_process_metablock(struct btrfsic_state *state,
                                      struct btrfsic_block *block,
                                      struct btrfsic_block_data_ctx *block_ctx,
-                                    struct btrfs_header *hdr,
                                      int limit_nesting, int force_iodone_flag);
+static void btrfsic_read_from_block_data(
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dst, u32 offset, size_t len);
  static int btrfsic_create_link_to_next_block(
                 struct btrfsic_state *state,
                 struct btrfsic_block *block,
@@ -318,12 +321,13 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx);
  static int btrfsic_read_block(struct btrfsic_state *state,
                               struct btrfsic_block_data_ctx *block_ctx);
  static void btrfsic_dump_database(struct btrfsic_state *state);
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err);
  static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size);
+                                    char **datav, unsigned int num_pages);
  static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr, u8 *mapped_data,
-                                         unsigned int len, struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                           struct buffer_head *bh,
                                           int submit_bio_bh_rw);
  static int btrfsic_process_written_superblock(
@@ -375,7 +379,7 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(
  static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                            u64 bytenr,
                                            struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data);
+                                          u64 dev_bytenr);
  
  static struct mutex btrfsic_mutex;
  static int btrfsic_is_initialized;
@@ -651,7 +655,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
         int pass;
  
         BUG_ON(NULL == state);
-       selected_super = kmalloc(sizeof(*selected_super), GFP_NOFS);
+       selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS);
         if (NULL == selected_super) {
                 printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                 return -1;
@@ -718,7 +722,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
  
                 num_copies =
                     btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                         printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                                (unsigned long long)next_bytenr, num_copies);
@@ -727,9 +731,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                         struct btrfsic_block *next_block;
                         struct btrfsic_block_data_ctx tmp_next_block_ctx;
                         struct btrfsic_block_link *l;
-                       struct btrfs_header *hdr;
  
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               state->metablock_size,
                                                 &tmp_next_block_ctx,
                                                 mirror_num);
                         if (ret) {
@@ -758,7 +762,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                         BUG_ON(NULL == l);
  
                         ret = btrfsic_read_block(state, &tmp_next_block_ctx);
-                       if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+                       if (ret < (int)PAGE_CACHE_SIZE) {
                                 printk(KERN_INFO
                                        "btrfsic: read @logical %llu failed!\n",
                                        (unsigned long long)
@@ -768,11 +772,9 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
                                 return -1;
                         }
  
-                       hdr = (struct btrfs_header *)tmp_next_block_ctx.data;
                         ret = btrfsic_process_metablock(state,
                                                         next_block,
                                                         &tmp_next_block_ctx,
-                                                       hdr,
                                                         BTRFS_MAX_LEVEL + 3, 1);
                         btrfsic_release_block_ctx(&tmp_next_block_ctx);
                 }
@@ -799,7 +801,10 @@ static int btrfsic_process_superblock_dev_mirror(
  
         /* super block bytenr is always the unmapped device bytenr */
         dev_bytenr = btrfs_sb_offset(superblock_mirror_num);
-       bh = __bread(superblock_bdev, dev_bytenr / 4096, 4096);
+       if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes)
+               return -1;
+       bh = __bread(superblock_bdev, dev_bytenr / 4096,
+                    BTRFS_SUPER_INFO_SIZE);
         if (NULL == bh)
                 return -1;
         super_tmp = (struct btrfs_super_block *)
@@ -808,7 +813,10 @@ static int btrfsic_process_superblock_dev_mirror(
         if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
             strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
                     sizeof(super_tmp->magic)) ||
-           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE)) {
+           memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
+           btrfs_super_nodesize(super_tmp) != state->metablock_size ||
+           btrfs_super_leafsize(super_tmp) != state->metablock_size ||
+           btrfs_super_sectorsize(super_tmp) != state->datablock_size) {
                 brelse(bh);
                 return 0;
         }
@@ -893,7 +901,7 @@ static int btrfsic_process_superblock_dev_mirror(
  
                 num_copies =
                     btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                         printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                                (unsigned long long)next_bytenr, num_copies);
@@ -902,7 +910,8 @@ static int btrfsic_process_superblock_dev_mirror(
                         struct btrfsic_block_data_ctx tmp_next_block_ctx;
                         struct btrfsic_block_link *l;
  
-                       if (btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       if (btrfsic_map_block(state, next_bytenr,
+                                             state->metablock_size,
                                               &tmp_next_block_ctx,
                                               mirror_num)) {
                                 printk(KERN_INFO "btrfsic: btrfsic_map_block("
@@ -966,13 +975,15 @@ static int btrfsic_process_metablock(
                 struct btrfsic_state *state,
                 struct btrfsic_block *const first_block,
                 struct btrfsic_block_data_ctx *const first_block_ctx,
-               struct btrfs_header *const first_hdr,
                 int first_limit_nesting, int force_iodone_flag)
  {
         struct btrfsic_stack_frame initial_stack_frame = { 0 };
         struct btrfsic_stack_frame *sf;
         struct btrfsic_stack_frame *next_stack;
+       struct btrfs_header *const first_hdr =
+               (struct btrfs_header *)first_block_ctx->datav[0];
  
+       BUG_ON(!first_hdr);
         sf = &initial_stack_frame;
         sf->error = 0;
         sf->i = -1;
@@ -1012,21 +1023,47 @@ continue_with_current_leaf_stack_frame:
                 }
  
                 if (sf->i < sf->nr) {
-                       struct btrfs_item *disk_item = leafhdr->items + sf->i;
-                       struct btrfs_disk_key *disk_key = &disk_item->key;
+                       struct btrfs_item disk_item;
+                       u32 disk_item_offset =
+                               (uintptr_t)(leafhdr->items + sf->i) -
+                               (uintptr_t)leafhdr;
+                       struct btrfs_disk_key *disk_key;
                         u8 type;
-                       const u32 item_offset = le32_to_cpu(disk_item->offset);
+                       u32 item_offset;
  
+                       if (disk_item_offset + sizeof(struct btrfs_item) >
+                           sf->block_ctx->len) {
+leaf_item_out_of_bounce_error:
+                               printk(KERN_INFO
+                                      "btrfsic: leaf item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(sf->block_ctx,
+                                                    &disk_item,
+                                                    disk_item_offset,
+                                                    sizeof(struct btrfs_item));
+                       item_offset = le32_to_cpu(disk_item.offset);
+                       disk_key = &disk_item.key;
                         type = disk_key->type;
  
                         if (BTRFS_ROOT_ITEM_KEY == type) {
-                               const struct btrfs_root_item *const root_item =
-                                   (struct btrfs_root_item *)
-                                   (sf->block_ctx->data +
-                                    offsetof(struct btrfs_leaf, items) +
-                                    item_offset);
-                               const u64 next_bytenr =
-                                   le64_to_cpu(root_item->bytenr);
+                               struct btrfs_root_item root_item;
+                               u32 root_item_offset;
+                               u64 next_bytenr;
+
+                               root_item_offset = item_offset +
+                                       offsetof(struct btrfs_leaf, items);
+                               if (root_item_offset +
+                                   sizeof(struct btrfs_root_item) >
+                                   sf->block_ctx->len)
+                                       goto leaf_item_out_of_bounce_error;
+                               btrfsic_read_from_block_data(
+                                       sf->block_ctx, &root_item,
+                                       root_item_offset,
+                                       sizeof(struct btrfs_root_item));
+                               next_bytenr = le64_to_cpu(root_item.bytenr);
  
                                 sf->error =
                                     btrfsic_create_link_to_next_block(
@@ -1041,7 +1078,7 @@ continue_with_current_leaf_stack_frame:
                                                 &sf->num_copies,
                                                 &sf->mirror_num,
                                                 disk_key,
-                                               le64_to_cpu(root_item->
+                                               le64_to_cpu(root_item.
                                                 generation));
                                 if (sf->error)
                                         goto one_stack_frame_backwards;
@@ -1049,7 +1086,7 @@ continue_with_current_leaf_stack_frame:
                                 if (NULL != sf->next_block) {
                                         struct btrfs_header *const next_hdr =
                                             (struct btrfs_header *)
-                                           sf->next_block_ctx.data;
+                                           sf->next_block_ctx.datav[0];
  
                                         next_stack =
                                             btrfsic_stack_frame_alloc();
@@ -1111,10 +1148,24 @@ continue_with_current_node_stack_frame:
                 }
  
                 if (sf->i < sf->nr) {
-                       struct btrfs_key_ptr *disk_key_ptr =
-                           nodehdr->ptrs + sf->i;
-                       const u64 next_bytenr =
-                           le64_to_cpu(disk_key_ptr->blockptr);
+                       struct btrfs_key_ptr key_ptr;
+                       u32 key_ptr_offset;
+                       u64 next_bytenr;
+
+                       key_ptr_offset = (uintptr_t)(nodehdr->ptrs + sf->i) -
+                                         (uintptr_t)nodehdr;
+                       if (key_ptr_offset + sizeof(struct btrfs_key_ptr) >
+                           sf->block_ctx->len) {
+                               printk(KERN_INFO
+                                      "btrfsic: node item out of bounce at logical %llu, dev %s\n",
+                                      sf->block_ctx->start,
+                                      sf->block_ctx->dev->name);
+                               goto one_stack_frame_backwards;
+                       }
+                       btrfsic_read_from_block_data(
+                               sf->block_ctx, &key_ptr, key_ptr_offset,
+                               sizeof(struct btrfs_key_ptr));
+                       next_bytenr = le64_to_cpu(key_ptr.blockptr);
  
                         sf->error = btrfsic_create_link_to_next_block(
                                         state,
@@ -1127,15 +1178,15 @@ continue_with_current_node_stack_frame:
                                         force_iodone_flag,
                                         &sf->num_copies,
                                         &sf->mirror_num,
-                                       &disk_key_ptr->key,
-                                       le64_to_cpu(disk_key_ptr->generation));
+                                       &key_ptr.key,
+                                       le64_to_cpu(key_ptr.generation));
                         if (sf->error)
                                 goto one_stack_frame_backwards;
  
                         if (NULL != sf->next_block) {
                                 struct btrfs_header *const next_hdr =
                                     (struct btrfs_header *)
-                                   sf->next_block_ctx.data;
+                                   sf->next_block_ctx.datav[0];
  
                                 next_stack = btrfsic_stack_frame_alloc();
                                 if (NULL == next_stack)
@@ -1181,6 +1232,35 @@ one_stack_frame_backwards:
         return sf->error;
  }
  
+static void btrfsic_read_from_block_data( /* copy len bytes, starting offset bytes into the block, from block_ctx->datav[] into dstv */
+       struct btrfsic_block_data_ctx *block_ctx,
+       void *dstv, u32 offset, size_t len)
+{
+       size_t cur; /* number of bytes copied in the current iteration */
+       size_t offset_in_page;
+       char *kaddr;
+       char *dst = (char *)dstv;
+       size_t start_offset = block_ctx->start & ((u64)PAGE_CACHE_SIZE - 1); /* position of block_ctx->start within its page */
+       unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT; /* index of the first datav[] entry to read */
+
+       WARN_ON(offset + len > block_ctx->len); /* only warns; the copy below still runs */
+       offset_in_page = (start_offset + offset) &
+                        ((unsigned long)PAGE_CACHE_SIZE - 1);
+
+       while (len > 0) {
+               cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); /* never copy past the end of the current page */
+               BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT); /* i must stay below block_ctx->len rounded up to whole pages */
+               kaddr = block_ctx->datav[i];
+               memcpy(dst, kaddr + offset_in_page, cur);
+
+               dst += cur;
+               len -= cur;
+               offset_in_page = 0; /* every following page is read from its beginning */
+               i++;
+       }
+}
+
  static int btrfsic_create_link_to_next_block(
                 struct btrfsic_state *state,
                 struct btrfsic_block *block,
@@ -1204,7 +1284,7 @@ static int btrfsic_create_link_to_next_block(
         if (0 == *num_copiesp) {
                 *num_copiesp =
                     btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->metablock_size);
                 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                         printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                                (unsigned long long)next_bytenr, *num_copiesp);
@@ -1219,7 +1299,7 @@ static int btrfsic_create_link_to_next_block(
                        "btrfsic_create_link_to_next_block(mirror_num=%d)\n",
                        *mirror_nump);
         ret = btrfsic_map_block(state, next_bytenr,
-                               BTRFSIC_BLOCK_SIZE,
+                               state->metablock_size,
                                 next_block_ctx, *mirror_nump);
         if (ret) {
                 printk(KERN_INFO
@@ -1314,7 +1394,7 @@ static int btrfsic_create_link_to_next_block(
  
         if (limit_nesting > 0 && did_alloc_block_link) {
                 ret = btrfsic_read_block(state, next_block_ctx);
-               if (ret < (int)BTRFSIC_BLOCK_SIZE) {
+               if (ret < (int)next_block_ctx->len) {
                         printk(KERN_INFO
                                "btrfsic: read block @logical %llu failed!\n",
                                (unsigned long long)next_bytenr);
@@ -1339,43 +1419,74 @@ static int btrfsic_handle_extent_data(
                 u32 item_offset, int force_iodone_flag)
  {
         int ret;
-       struct btrfs_file_extent_item *file_extent_item =
-           (struct btrfs_file_extent_item *)(block_ctx->data +
-                                             offsetof(struct btrfs_leaf,
-                                                      items) + item_offset);
-       u64 next_bytenr =
-           le64_to_cpu(file_extent_item->disk_bytenr) +
-           le64_to_cpu(file_extent_item->offset);
-       u64 num_bytes = le64_to_cpu(file_extent_item->num_bytes);
-       u64 generation = le64_to_cpu(file_extent_item->generation);
+       struct btrfs_file_extent_item file_extent_item;
+       u64 file_extent_item_offset;
+       u64 next_bytenr;
+       u64 num_bytes;
+       u64 generation;
         struct btrfsic_block_link *l;
  
+       file_extent_item_offset = offsetof(struct btrfs_leaf, items) +
+                                 item_offset;
+       if (file_extent_item_offset +
+           offsetof(struct btrfs_file_extent_item, disk_num_bytes) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+               file_extent_item_offset,
+               offsetof(struct btrfs_file_extent_item, disk_num_bytes));
+       if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
+           ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+               if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
+                       printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
+                              file_extent_item.type,
+                              (unsigned long long)
+                              le64_to_cpu(file_extent_item.disk_bytenr));
+               return 0;
+       }
+
+       if (file_extent_item_offset + sizeof(struct btrfs_file_extent_item) >
+           block_ctx->len) {
+               printk(KERN_INFO
+                      "btrfsic: file item out of bounce at logical %llu, dev %s\n",
+                      block_ctx->start, block_ctx->dev->name);
+               return -1;
+       }
+       btrfsic_read_from_block_data(block_ctx, &file_extent_item,
+                                    file_extent_item_offset,
+                                    sizeof(struct btrfs_file_extent_item));
+       next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
+                     le64_to_cpu(file_extent_item.offset);
+       generation = le64_to_cpu(file_extent_item.generation);
+       num_bytes = le64_to_cpu(file_extent_item.num_bytes);
+       generation = le64_to_cpu(file_extent_item.generation);
+
         if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
                 printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
                        " offset = %llu, num_bytes = %llu\n",
-                      file_extent_item->type,
+                      file_extent_item.type,
                        (unsigned long long)
-                      le64_to_cpu(file_extent_item->disk_bytenr),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->offset),
-                      (unsigned long long)
-                      le64_to_cpu(file_extent_item->num_bytes));
-       if (BTRFS_FILE_EXTENT_REG != file_extent_item->type ||
-           ((u64)0) == le64_to_cpu(file_extent_item->disk_bytenr))
-               return 0;
+                      le64_to_cpu(file_extent_item.disk_bytenr),
+                      (unsigned long long)le64_to_cpu(file_extent_item.offset),
+                      (unsigned long long)num_bytes);
         while (num_bytes > 0) {
                 u32 chunk_len;
                 int num_copies;
                 int mirror_num;
  
-               if (num_bytes > BTRFSIC_BLOCK_SIZE)
-                       chunk_len = BTRFSIC_BLOCK_SIZE;
+               if (num_bytes > state->datablock_size)
+                       chunk_len = state->datablock_size;
                 else
                         chunk_len = num_bytes;
  
                 num_copies =
                     btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, state->datablock_size);
                 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                         printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                                (unsigned long long)next_bytenr, num_copies);
@@ -1475,8 +1586,9 @@ static int btrfsic_map_block(struct btrfsic_state *state, u64 bytenr, u32 len,
         block_ctx_out->dev_bytenr = multi->stripes[0].physical;
         block_ctx_out->start = bytenr;
         block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
  
         if (0 == ret)
                 kfree(multi);
@@ -1496,8 +1608,9 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
         block_ctx_out->dev_bytenr = bytenr;
         block_ctx_out->start = bytenr;
         block_ctx_out->len = len;
-       block_ctx_out->data = NULL;
-       block_ctx_out->bh = NULL;
+       block_ctx_out->datav = NULL;
+       block_ctx_out->pagev = NULL;
+       block_ctx_out->mem_to_free = NULL;
         if (NULL != block_ctx_out->dev) {
                 return 0;
         } else {
@@ -1508,38 +1621,127 @@ static int btrfsic_map_superblock(struct btrfsic_state *state, u64 bytenr,
  
  static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
  {
-       if (NULL != block_ctx->bh) {
-               brelse(block_ctx->bh);
-               block_ctx->bh = NULL;
+       if (block_ctx->mem_to_free) {
+               unsigned int num_pages;
+
+               BUG_ON(!block_ctx->datav);
+               BUG_ON(!block_ctx->pagev);
+               num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                           PAGE_CACHE_SHIFT;
+               while (num_pages > 0) {
+                       num_pages--;
+                       if (block_ctx->datav[num_pages]) {
+                               kunmap(block_ctx->pagev[num_pages]);
+                               block_ctx->datav[num_pages] = NULL;
+                       }
+                       if (block_ctx->pagev[num_pages]) {
+                               __free_page(block_ctx->pagev[num_pages]);
+                               block_ctx->pagev[num_pages] = NULL;
+                       }
+               }
+
+               kfree(block_ctx->mem_to_free);
+               block_ctx->mem_to_free = NULL;
+               block_ctx->pagev = NULL;
+               block_ctx->datav = NULL;
         }
  }
  
  static int btrfsic_read_block(struct btrfsic_state *state,
                               struct btrfsic_block_data_ctx *block_ctx)
  {
-       block_ctx->bh = NULL;
-       if (block_ctx->dev_bytenr & 4095) {
+       unsigned int num_pages;
+       unsigned int i;
+       u64 dev_bytenr;
+       int ret;
+
+       BUG_ON(block_ctx->datav);
+       BUG_ON(block_ctx->pagev);
+       BUG_ON(block_ctx->mem_to_free);
+       if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
                 printk(KERN_INFO
                        "btrfsic: read_block() with unaligned bytenr %llu\n",
                        (unsigned long long)block_ctx->dev_bytenr);
                 return -1;
         }
-       if (block_ctx->len > 4096) {
-               printk(KERN_INFO
-                      "btrfsic: read_block() with too huge size %d\n",
-                      block_ctx->len);
+
+       num_pages = (block_ctx->len + (u64)PAGE_CACHE_SIZE - 1) >>
+                   PAGE_CACHE_SHIFT;
+       block_ctx->mem_to_free = kzalloc((sizeof(*block_ctx->datav) +
+                                         sizeof(*block_ctx->pagev)) *
+                                        num_pages, GFP_NOFS);
+       if (!block_ctx->mem_to_free)
                 return -1;
+       block_ctx->datav = block_ctx->mem_to_free;
+       block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->pagev[i] = alloc_page(GFP_NOFS);
+               if (!block_ctx->pagev[i])
+                       return -1;
         }
  
-       block_ctx->bh = __bread(block_ctx->dev->bdev,
-                               block_ctx->dev_bytenr >> 12, 4096);
-       if (NULL == block_ctx->bh)
-               return -1;
-       block_ctx->data = block_ctx->bh->b_data;
+       dev_bytenr = block_ctx->dev_bytenr;
+       for (i = 0; i < num_pages;) {
+               struct bio *bio;
+               unsigned int j;
+               DECLARE_COMPLETION_ONSTACK(complete);
+
+               bio = bio_alloc(GFP_NOFS, num_pages - i);
+               if (!bio) {
+                       printk(KERN_INFO
+                              "btrfsic: bio_alloc() for %u pages failed!\n",
+                              num_pages - i);
+                       return -1;
+               }
+               bio->bi_bdev = block_ctx->dev->bdev;
+               bio->bi_sector = dev_bytenr >> 9;
+               bio->bi_end_io = btrfsic_complete_bio_end_io;
+               bio->bi_private = &complete;
+
+               for (j = i; j < num_pages; j++) {
+                       ret = bio_add_page(bio, block_ctx->pagev[j],
+                                          PAGE_CACHE_SIZE, 0);
+                       if (PAGE_CACHE_SIZE != ret)
+                               break;
+               }
+               if (j == i) {
+                       printk(KERN_INFO
+                              "btrfsic: error, failed to add a single page!\n");
+                       return -1;
+               }
+               submit_bio(READ, bio);
+
+               /* this will also unplug the queue */
+               wait_for_completion(&complete);
+
+               if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+                       printk(KERN_INFO
+                              "btrfsic: read error at logical %llu dev %s!\n",
+                              block_ctx->start, block_ctx->dev->name);
+                       bio_put(bio);
+                       return -1;
+               }
+               bio_put(bio);
+               dev_bytenr += (j - i) * PAGE_CACHE_SIZE;
+               i = j;
+       }
+       for (i = 0; i < num_pages; i++) {
+               block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+               if (!block_ctx->datav[i]) {
+                       printk(KERN_INFO "btrfsic: kmap() failed (dev %s)!\n",
+                              block_ctx->dev->name);
+                       return -1;
+               }
+       }
  
         return block_ctx->len;
  }
  
+static void btrfsic_complete_bio_end_io(struct bio *bio, int err)
+{
+       complete((struct completion *)bio->bi_private);
+}
+
  static void btrfsic_dump_database(struct btrfsic_state *state)
  {
         struct list_head *elem_all;
@@ -1617,32 +1819,39 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
   * (note that this test fails for the super block)
   */
  static int btrfsic_test_for_metadata(struct btrfsic_state *state,
-                                    const u8 *data, unsigned int size)
+                                    char **datav, unsigned int num_pages)
  {
         struct btrfs_header *h;
         u8 csum[BTRFS_CSUM_SIZE];
         u32 crc = ~(u32)0;
-       int fail = 0;
-       int crc_fail = 0;
+       unsigned int i;
  
-       h = (struct btrfs_header *)data;
+       if (num_pages * PAGE_CACHE_SIZE < state->metablock_size)
+               return 1; /* not metadata */
+       num_pages = state->metablock_size >> PAGE_CACHE_SHIFT;
+       h = (struct btrfs_header *)datav[0];
  
         if (memcmp(h->fsid, state->root->fs_info->fsid, BTRFS_UUID_SIZE))
-               fail++;
+               return 1;
+
+       for (i = 0; i < num_pages; i++) {
+               u8 *data = i ? datav[i] : (datav[i] + BTRFS_CSUM_SIZE);
+               size_t sublen = i ? PAGE_CACHE_SIZE :
+                                   (PAGE_CACHE_SIZE - BTRFS_CSUM_SIZE);
  
-       crc = crc32c(crc, data + BTRFS_CSUM_SIZE, PAGE_SIZE - BTRFS_CSUM_SIZE);
+               crc = crc32c(crc, data, sublen);
+       }
         btrfs_csum_final(crc, csum);
         if (memcmp(csum, h->csum, state->csum_size))
-               crc_fail++;
+               return 1;
  
-       return fail || crc_fail;
+       return 0; /* is metadata */
  }
  
  static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
-                                         u64 dev_bytenr,
-                                         u8 *mapped_data, unsigned int len,
-                                         struct bio *bio,
-                                         int *bio_is_patched,
+                                         u64 dev_bytenr, char **mapped_datav,
+                                         unsigned int num_pages,
+                                         struct bio *bio, int *bio_is_patched,
                                           struct buffer_head *bh,
                                           int submit_bio_bh_rw)
  {
@@ -1652,12 +1861,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
         int ret;
         struct btrfsic_state *state = dev_state->state;
         struct block_device *bdev = dev_state->bdev;
+       unsigned int processed_len;
  
-       WARN_ON(len > PAGE_SIZE);
-       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_data, len));
         if (NULL != bio_is_patched)
                 *bio_is_patched = 0;
  
+again:
+       if (num_pages == 0)
+               return;
+
+       processed_len = 0;
+       is_metadata = (0 == btrfsic_test_for_metadata(state, mapped_datav,
+                                                     num_pages));
+
         block = btrfsic_block_hashtable_lookup(bdev, dev_bytenr,
                                                &state->block_hashtable);
         if (NULL != block) {
@@ -1667,8 +1883,16 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
  
                 if (block->is_superblock) {
                         bytenr = le64_to_cpu(((struct btrfs_super_block *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           BTRFS_SUPER_INFO_SIZE) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
                         is_metadata = 1;
+                       BUG_ON(BTRFS_SUPER_INFO_SIZE & (PAGE_CACHE_SIZE - 1));
+                       processed_len = BTRFS_SUPER_INFO_SIZE;
                         if (state->print_mask &
                             BTRFSIC_PRINT_MASK_TREE_BEFORE_SB_WRITE) {
                                 printk(KERN_INFO
@@ -1678,12 +1902,18 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                 }
                 if (is_metadata) {
                         if (!block->is_superblock) {
+                               if (num_pages * PAGE_CACHE_SIZE <
+                                   state->metablock_size) {
+                                       printk(KERN_INFO
+                                              "btrfsic: cannot work with too short bios!\n");
+                                       return;
+                               }
+                               processed_len = state->metablock_size;
                                 bytenr = le64_to_cpu(((struct btrfs_header *)
-                                                     mapped_data)->bytenr);
+                                                     mapped_datav[0])->bytenr);
                                 btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
                                                                dev_state,
-                                                              dev_bytenr,
-                                                              mapped_data);
+                                                              dev_bytenr);
                         }
                         if (block->logical_bytenr != bytenr) {
                                 printk(KERN_INFO
@@ -1710,6 +1940,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                        block->mirror_num,
                                        btrfsic_get_block_type(state, block));
                 } else {
+                       if (num_pages * PAGE_CACHE_SIZE <
+                           state->datablock_size) {
+                               printk(KERN_INFO
+                                      "btrfsic: cannot work with too short bios!\n");
+                               return;
+                       }
+                       processed_len = state->datablock_size;
                         bytenr = block->logical_bytenr;
                         if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                 printk(KERN_INFO
@@ -1747,7 +1984,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                le64_to_cpu(block->disk_key.offset),
                                (unsigned long long)
                                le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation),
+                                           mapped_datav[0])->generation),
                                (unsigned long long)
                                state->max_superblock_generation);
                         btrfsic_dump_tree(state);
@@ -1765,10 +2002,10 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                (unsigned long long)block->generation,
                                (unsigned long long)
                                le64_to_cpu(((struct btrfs_header *)
-                                           mapped_data)->generation));
+                                           mapped_datav[0])->generation));
                         /* it would not be safe to go on */
                         btrfsic_dump_tree(state);
-                       return;
+                       goto continue_loop;
                 }
  
                 /*
@@ -1796,18 +2033,19 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                 }
  
                 if (block->is_superblock)
-                       ret = btrfsic_map_superblock(state, bytenr, len,
+                       ret = btrfsic_map_superblock(state, bytenr,
+                                                    processed_len,
                                                      bdev, &block_ctx);
                 else
-                       ret = btrfsic_map_block(state, bytenr, len,
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
                                                 &block_ctx, 0);
                 if (ret) {
                         printk(KERN_INFO
                                "btrfsic: btrfsic_map_block(root @%llu)"
                                " failed!\n", (unsigned long long)bytenr);
-                       return;
+                       goto continue_loop;
                 }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                 /* the following is required in case of writes to mirrors,
                  * use the same that was used for the lookup */
                 block_ctx.dev = dev_state;
@@ -1863,11 +2101,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                         block->logical_bytenr = bytenr;
                         block->is_metadata = 1;
                         if (block->is_superblock) {
+                               BUG_ON(PAGE_CACHE_SIZE !=
+                                      BTRFS_SUPER_INFO_SIZE);
                                 ret = btrfsic_process_written_superblock(
                                                 state,
                                                 block,
                                                 (struct btrfs_super_block *)
-                                               mapped_data);
+                                               mapped_datav[0]);
                                 if (state->print_mask &
                                     BTRFSIC_PRINT_MASK_TREE_AFTER_SB_WRITE) {
                                         printk(KERN_INFO
@@ -1880,8 +2120,6 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                                 state,
                                                 block,
                                                 &block_ctx,
-                                               (struct btrfs_header *)
-                                               block_ctx.data,
                                                 0, 0);
                         }
                         if (ret)
@@ -1912,26 +2150,30 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                 u64 bytenr;
  
                 if (!is_metadata) {
+                       processed_len = state->datablock_size;
                         if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                 printk(KERN_INFO "Written block (%s/%llu/?)"
                                        " !found in hash table, D.\n",
                                        dev_state->name,
                                        (unsigned long long)dev_bytenr);
-                       if (!state->include_extent_data)
-                               return; /* ignore that written D block */
+                       if (!state->include_extent_data) {
+                               /* ignore that written D block */
+                               goto continue_loop;
+                       }
  
                         /* this is getting ugly for the
                          * include_extent_data case... */
                         bytenr = 0;     /* unknown */
                         block_ctx.start = bytenr;
-                       block_ctx.len = len;
-                       block_ctx.bh = NULL;
+                       block_ctx.len = processed_len;
+                       block_ctx.mem_to_free = NULL;
+                       block_ctx.pagev = NULL;
                 } else {
+                       processed_len = state->metablock_size;
                         bytenr = le64_to_cpu(((struct btrfs_header *)
-                                             mapped_data)->bytenr);
+                                             mapped_datav[0])->bytenr);
                         btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
-                                                      dev_bytenr,
-                                                      mapped_data);
+                                                      dev_bytenr);
                         if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
                                 printk(KERN_INFO
                                        "Written block @%llu (%s/%llu/?)"
@@ -1940,17 +2182,17 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                                        dev_state->name,
                                        (unsigned long long)dev_bytenr);
  
-                       ret = btrfsic_map_block(state, bytenr, len, &block_ctx,
-                                               0);
+                       ret = btrfsic_map_block(state, bytenr, processed_len,
+                                               &block_ctx, 0);
                         if (ret) {
                                 printk(KERN_INFO
                                        "btrfsic: btrfsic_map_block(root @%llu)"
                                        " failed!\n",
                                        (unsigned long long)dev_bytenr);
-                               return;
+                               goto continue_loop;
                         }
                 }
-               block_ctx.data = mapped_data;
+               block_ctx.datav = mapped_datav;
                 /* the following is required in case of writes to mirrors,
                  * use the same that was used for the lookup */
                 block_ctx.dev = dev_state;
@@ -1960,7 +2202,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                 if (NULL == block) {
                         printk(KERN_INFO "btrfsic: error, kmalloc failed!\n");
                         btrfsic_release_block_ctx(&block_ctx);
-                       return;
+                       goto continue_loop;
                 }
                 block->dev_state = dev_state;
                 block->dev_bytenr = dev_bytenr;
@@ -2020,9 +2262,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
  
                 if (is_metadata) {
                         ret = btrfsic_process_metablock(state, block,
-                                                       &block_ctx,
-                                                       (struct btrfs_header *)
-                                                       block_ctx.data, 0, 0);
+                                                       &block_ctx, 0, 0);
                         if (ret)
                                 printk(KERN_INFO
                                        "btrfsic: process_metablock(root @%llu)"
@@ -2031,6 +2271,13 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
                 }
                 btrfsic_release_block_ctx(&block_ctx);
         }
+
+continue_loop:
+       BUG_ON(!processed_len);
+       dev_bytenr += processed_len;
+       mapped_datav += processed_len >> PAGE_CACHE_SHIFT;
+       num_pages -= processed_len >> PAGE_CACHE_SHIFT;
+       goto again;
  }
  
  static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
@@ -2213,7 +2460,7 @@ static int btrfsic_process_written_superblock(
  
                 num_copies =
                     btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                    next_bytenr, PAGE_SIZE);
+                                    next_bytenr, BTRFS_SUPER_INFO_SIZE);
                 if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
                         printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
                                (unsigned long long)next_bytenr, num_copies);
@@ -2224,7 +2471,8 @@ static int btrfsic_process_written_superblock(
                                 printk(KERN_INFO
                                        "btrfsic_process_written_superblock("
                                        "mirror_num=%d)\n", mirror_num);
-                       ret = btrfsic_map_block(state, next_bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, next_bytenr,
+                                               BTRFS_SUPER_INFO_SIZE,
                                                 &tmp_next_block_ctx,
                                                 mirror_num);
                         if (ret) {
@@ -2689,7 +2937,7 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
  static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                                            u64 bytenr,
                                            struct btrfsic_dev_state *dev_state,
-                                          u64 dev_bytenr, char *data)
+                                          u64 dev_bytenr)
  {
         int num_copies;
         int mirror_num;
@@ -2698,10 +2946,10 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
         int match = 0;
  
         num_copies = btrfs_num_copies(&state->root->fs_info->mapping_tree,
-                                     bytenr, PAGE_SIZE);
+                                     bytenr, state->metablock_size);
  
         for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-               ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+               ret = btrfsic_map_block(state, bytenr, state->metablock_size,
                                         &block_ctx, mirror_num);
                 if (ret) {
                         printk(KERN_INFO "btrfsic:"
@@ -2727,7 +2975,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
                        (unsigned long long)bytenr, dev_state->name,
                        (unsigned long long)dev_bytenr);
                 for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
-                       ret = btrfsic_map_block(state, bytenr, PAGE_SIZE,
+                       ret = btrfsic_map_block(state, bytenr,
+                                               state->metablock_size,
                                                 &block_ctx, mirror_num);
                         if (ret)
                                 continue;
@@ -2781,13 +3030,13 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
                                (unsigned long)bh->b_size, bh->b_data,
                                bh->b_bdev);
                 btrfsic_process_written_block(dev_state, dev_bytenr,
-                                             bh->b_data, bh->b_size, NULL,
+                                             &bh->b_data, 1, NULL,
                                               NULL, bh, rw);
         } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                 if (dev_state->state->print_mask &
                     BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                         printk(KERN_INFO
-                              "submit_bh(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bh(rw=0x%x FLUSH, bdev=%p)\n",
                                rw, bh->b_bdev);
                 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                         if ((dev_state->state->print_mask &
@@ -2836,6 +3085,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                 unsigned int i;
                 u64 dev_bytenr;
                 int bio_is_patched;
+               char **mapped_datav;
  
                 dev_bytenr = 512 * bio->bi_sector;
                 bio_is_patched = 0;
@@ -2848,35 +3098,46 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                                (unsigned long long)dev_bytenr,
                                bio->bi_bdev);
  
+               mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
+                                      GFP_NOFS);
+               if (!mapped_datav)
+                       goto leave;
                 for (i = 0; i < bio->bi_vcnt; i++) {
-                       u8 *mapped_data;
-
-                       mapped_data = kmap(bio->bi_io_vec[i].bv_page);
+                       BUG_ON(bio->bi_io_vec[i].bv_len != PAGE_CACHE_SIZE);
+                       mapped_datav[i] = kmap(bio->bi_io_vec[i].bv_page);
+                       if (!mapped_datav[i]) {
+                               while (i > 0) {
+                                       i--;
+                                       kunmap(bio->bi_io_vec[i].bv_page);
+                               }
+                               kfree(mapped_datav);
+                               goto leave;
+                       }
                         if ((BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                              BTRFSIC_PRINT_MASK_VERBOSE) ==
                             (dev_state->state->print_mask &
                              (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
                               BTRFSIC_PRINT_MASK_VERBOSE)))
                                 printk(KERN_INFO
-                                      "#%u: page=%p, mapped=%p, len=%u,"
-                                      " offset=%u\n",
+                                      "#%u: page=%p, len=%u, offset=%u\n",
                                        i, bio->bi_io_vec[i].bv_page,
-                                      mapped_data,
                                        bio->bi_io_vec[i].bv_len,
                                        bio->bi_io_vec[i].bv_offset);
-                       btrfsic_process_written_block(dev_state, dev_bytenr,
-                                                     mapped_data,
-                                                     bio->bi_io_vec[i].bv_len,
-                                                     bio, &bio_is_patched,
-                                                     NULL, rw);
+               }
+               btrfsic_process_written_block(dev_state, dev_bytenr,
+                                             mapped_datav, bio->bi_vcnt,
+                                             bio, &bio_is_patched,
+                                             NULL, rw);
+               while (i > 0) {
+                       i--;
                         kunmap(bio->bi_io_vec[i].bv_page);
-                       dev_bytenr += bio->bi_io_vec[i].bv_len;
                 }
+               kfree(mapped_datav);
         } else if (NULL != dev_state && (rw & REQ_FLUSH)) {
                 if (dev_state->state->print_mask &
                     BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
                         printk(KERN_INFO
-                              "submit_bio(rw=0x%x) FLUSH, bdev=%p)\n",
+                              "submit_bio(rw=0x%x FLUSH, bdev=%p)\n",
                                rw, bio->bi_bdev);
                 if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
                         if ((dev_state->state->print_mask &
@@ -2903,6 +3164,7 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
                         bio->bi_end_io = btrfsic_bio_end_io;
                 }
         }
+leave:
         mutex_unlock(&btrfsic_mutex);
  
         submit_bio(rw, bio);
@@ -2917,6 +3179,30 @@ int btrfsic_mount(struct btrfs_root *root,
         struct list_head *dev_head = &fs_devices->devices;
         struct btrfs_device *device;
  
+       if (root->nodesize != root->leafsize) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d != leafsize %d!\n",
+                      root->nodesize, root->leafsize);
+               return -1;
+       }
+       if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
+       if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
+               printk(KERN_INFO
+                      "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
+                      root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+               return -1;
+       }
         state = kzalloc(sizeof(*state), GFP_NOFS);
         if (NULL == state) {
                 printk(KERN_INFO "btrfs check-integrity: kmalloc() failed!\n");
@@ -2933,6 +3219,8 @@ int btrfsic_mount(struct btrfs_root *root,
         state->print_mask = print_mask;
         state->include_extent_data = including_extent_data;
         state->csum_size = 0;
+       state->metablock_size = root->nodesize;
+       state->datablock_size = root->sectorsize;
         INIT_LIST_HEAD(&state->all_blocks_list);
         btrfsic_block_hashtable_init(&state->block_hashtable);
         btrfsic_block_link_hashtable_init(&state->block_link_hashtable);
@@ -3049,7 +3337,7 @@ void btrfsic_unmount(struct btrfs_root *root,
                                 btrfsic_block_link_free(l);
                 }
  
-               if (b_all->is_iodone)
+               if (b_all->is_iodone || b_all->never_written)
                         btrfsic_block_free(b_all);
                 else
                         printk(KERN_INFO "btrfs: attempt to free %c-block"
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c

index 4106264fbc655ac79b26efa1177384ea92b72988..d7a96cfdc50ae6a2d8afef1dad7ca3642248bbb8 100644 (file)
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -18,6 +18,7 @@
  
  #include <linux/sched.h>
  #include <linux/slab.h>
+#include <linux/rbtree.h>
  #include "ctree.h"
  #include "disk-io.h"
  #include "transaction.h"
@@ -37,7 +38,16 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                               struct extent_buffer *dst_buf,
                               struct extent_buffer *src_buf);
  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                  struct btrfs_path *path, int level, int slot);
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb);
+struct extent_buffer *read_old_tree_block(struct btrfs_root *root, u64 bytenr,
+                                         u32 blocksize, u64 parent_transid,
+                                         u64 time_seq);
+struct extent_buffer *btrfs_find_old_tree_block(struct btrfs_root *root,
+                                               u64 bytenr, u32 blocksize,
+                                               u64 time_seq);
  
  struct btrfs_path *btrfs_alloc_path(void)
  {
@@ -255,7 +265,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
  
         cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
                                      new_root_objectid, &disk_key, level,
-                                    buf->start, 0, 1);
+                                    buf->start, 0);
         if (IS_ERR(cow))
                 return PTR_ERR(cow);
  
@@ -288,6 +298,434 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
         return 0;
  }
  
+/*
+ * kinds of modifications recorded in the tree mod log. each value names the
+ * operation that was applied to a tree block; __tree_mod_log_rewind applies
+ * the opposite when going back in time.
+ */
+enum mod_log_op {
+       /* the key pointer in a slot is about to be overwritten */
+       MOD_LOG_KEY_REPLACE,
+       /* a key pointer was added at a slot (logged for copy destinations) */
+       MOD_LOG_KEY_ADD,
+       /* a key pointer was removed from a slot (logged for copy sources) */
+       MOD_LOG_KEY_REMOVE,
+       /* logged per slot by tree_mod_log_free_eb, last slot first */
+       MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+       /* logged for slots that a following MOD_LOG_MOVE_KEYS overwrites */
+       MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+       /* nr_items key pointers were moved within one block */
+       MOD_LOG_MOVE_KEYS,
+       /* the root node of a tree was replaced by another block */
+       MOD_LOG_ROOT_REPLACE,
+};
+
+/*
+ * payload of a MOD_LOG_MOVE_KEYS entry. the source slot of the move is kept
+ * in tree_mod_elem.slot, see tree_mod_log_insert_move.
+ */
+struct tree_mod_move {
+       int dst_slot;           /* first slot the key pointers were moved to */
+       int nr_items;           /* number of key pointers that were moved */
+};
+
+/*
+ * payload of a MOD_LOG_ROOT_REPLACE entry: describes the root node that was
+ * replaced, see tree_mod_log_insert_root.
+ */
+struct tree_mod_root {
+       u64 logical;            /* ->start of the old root's extent buffer */
+       u8 level;               /* header level of the old root */
+};
+
+/*
+ * one entry of the tree mod log. entries live in the fs_info->tree_mod_log
+ * rb tree, keyed by (index, elem.seq), and are also linked into
+ * fs_info->tree_mod_seq_list through elem.
+ */
+struct tree_mod_elem {
+       struct rb_node node;
+       u64 index;              /* shifted logical */
+       struct seq_list elem;
+       enum mod_log_op op;
+
+       /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+       int slot;
+
+       /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+       u64 generation;
+
+       /*
+        * those hold the old key and block pointer of the slot. they are
+        * filled in for every MOD_LOG_KEY_* operation except MOD_LOG_KEY_ADD.
+        */
+       struct btrfs_disk_key key;
+       u64 blockptr;
+
+       /* this is used for op == MOD_LOG_MOVE_KEYS */
+       struct tree_mod_move move;
+
+       /* this is used for op == MOD_LOG_ROOT_REPLACE */
+       struct tree_mod_root old_root;
+};
+
+/*
+ * hand out the next tree mod sequence number to elem and queue elem at the
+ * tail of fs_info->tree_mod_seq_list. the callers in this file hold
+ * fs_info->tree_mod_seq_lock around this call.
+ */
+static inline void
+__get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem)
+{
+       struct list_head *seq_list = &fs_info->tree_mod_seq_list;
+
+       elem->seq = atomic_inc_return(&fs_info->tree_mod_seq);
+       list_add_tail(&elem->list, seq_list);
+}
+
+/*
+ * register elem as a blocker of the tree mod log. bit 0 of elem->flags is
+ * set (btrfs_put_tree_mod_seq tests that bit to tell blockers from log
+ * entries), then elem receives the next sequence number and is appended to
+ * fs_info->tree_mod_seq_list under tree_mod_seq_lock.
+ */
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       elem->flags = 1;
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       __get_tree_mod_seq(fs_info, elem);
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * drop the blocker elem that was registered with btrfs_get_tree_mod_seq and
+ * prune the tree mod log if possible.
+ *
+ * nothing happens if elem->seq is 0. otherwise elem is unlinked from
+ * fs_info->tree_mod_seq_list. if another blocker with a lower sequence number
+ * is still on the list, the log is left untouched. if not, every log entry
+ * whose sequence number is not above the lowest remaining blocker is erased
+ * from the rb tree and freed (all entries, if no blocker remains).
+ *
+ * elem->seq itself is not reset here.
+ */
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem)
+{
+       struct rb_root *tm_root;
+       struct rb_node *node;
+       struct rb_node *next;
+       struct seq_list *cur_elem;
+       struct tree_mod_elem *tm;
+       u64 min_seq = (u64)-1;
+       u64 seq_putting = elem->seq;
+
+       if (!seq_putting)
+               return;
+
+       BUG_ON(!(elem->flags & 1));
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       list_del(&elem->list);
+
+       /* only list members with flag bit 0 set are blockers */
+       list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+               if ((cur_elem->flags & 1) && cur_elem->seq < min_seq) {
+                       if (seq_putting > cur_elem->seq) {
+                               /*
+                                * blocker with lower sequence number exists, we
+                                * cannot remove anything from the log
+                                */
+                               goto out;
+                       }
+                       min_seq = cur_elem->seq;
+               }
+       }
+
+       /*
+        * anything that's not above the lowest existing (read: blocked)
+        * sequence number can be removed from the tree.
+        */
+       write_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       for (node = rb_first(tm_root); node; node = next) {
+               /* fetch the successor first, node is erased below */
+               next = rb_next(node);
+               tm = container_of(node, struct tree_mod_elem, node);
+               if (tm->elem.seq > min_seq)
+                       continue;
+               rb_erase(node, tm_root);
+               list_del(&tm->elem.list);
+               kfree(tm);
+       }
+       write_unlock(&fs_info->tree_mod_log_lock);
+out:
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+}
+
+/*
+ * link tm into the fs_info->tree_mod_log rb tree, under the write side of
+ * tree_mod_log_lock.
+ *
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ *
+ * an element comparing higher than an existing node is linked to that node's
+ * left, i.e. the tree is kept in descending (index, sequence) order.
+ *
+ * returns 0 on success. if an entry with the same index and sequence already
+ * exists, tm is freed and -EEXIST is returned.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+       struct rb_root *tm_root;
+       struct rb_node **new;
+       struct rb_node *parent = NULL;
+       struct tree_mod_elem *cur;
+       int ret = 0;
+
+       BUG_ON(!tm || !tm->elem.seq);
+
+       write_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       new = &tm_root->rb_node;
+       /* walk down to the empty link where tm belongs */
+       while (*new) {
+               cur = container_of(*new, struct tree_mod_elem, node);
+               parent = *new;
+               if (cur->index < tm->index)
+                       new = &((*new)->rb_left);
+               else if (cur->index > tm->index)
+                       new = &((*new)->rb_right);
+               else if (cur->elem.seq < tm->elem.seq)
+                       new = &((*new)->rb_left);
+               else if (cur->elem.seq > tm->elem.seq)
+                       new = &((*new)->rb_right);
+               else {
+                       /* same index and sequence: refuse the duplicate */
+                       kfree(tm);
+                       ret = -EEXIST;
+                       goto unlock;
+               }
+       }
+
+       rb_link_node(&tm->node, parent, new);
+       rb_insert_color(&tm->node, tm_root);
+unlock:
+       write_unlock(&fs_info->tree_mod_log_lock);
+       return ret;
+}
+
+/*
+ * decide whether a modification can be skipped by the tree mod log.
+ *
+ * returns 1 (don't log) if fs_info->tree_mod_seq_list is empty, or if eb is
+ * given and is a level 0 block. returns 0 (log it) otherwise, which includes
+ * the case of a non-empty list and eb == NULL.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+                                   struct extent_buffer *eb)
+{
+       smp_mb();
+       if (list_empty(&(fs_info)->tree_mod_seq_list))
+               return 1;
+
+       /* without a buffer only the list check applies */
+       return eb && btrfs_header_level(eb) == 0;
+}
+
+/*
+ * allocate a zeroed log element and assign it a sequence number.
+ *
+ * returns
+ *   > 0      the sequence number; *tm_ret points to the new element, which is
+ *            already linked into fs_info->tree_mod_seq_list
+ *   0        nothing has to be logged; *tm_ret is NULL
+ *   -ENOMEM  allocation failed; *tm_ret is NULL
+ *
+ * *tm_ret is set to NULL on every path that does not hand out an element, so
+ * a caller can never pick up an uninitialized or already freed pointer.
+ */
+static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
+                                struct tree_mod_elem **tm_ret)
+{
+       struct tree_mod_elem *tm;
+       int seq;
+
+       *tm_ret = NULL;
+       if (tree_mod_dont_log(fs_info, NULL))
+               return 0;
+
+       tm = *tm_ret = kzalloc(sizeof(*tm), flags);
+       if (!tm)
+               return -ENOMEM;
+
+       /* flag bit 0 clear: this list member is a log entry, not a blocker */
+       tm->elem.flags = 0;
+       spin_lock(&fs_info->tree_mod_seq_lock);
+       if (list_empty(&fs_info->tree_mod_seq_list)) {
+               /*
+                * someone emptied the list while we were waiting for the lock.
+                * we must not add to the list, because no blocker exists. items
+                * are removed from the list only when the existing blocker is
+                * removed from the list.
+                */
+               kfree(tm);
+               *tm_ret = NULL;
+               seq = 0;
+       } else {
+               __get_tree_mod_seq(fs_info, &tm->elem);
+               seq = tm->elem.seq;
+       }
+       spin_unlock(&fs_info->tree_mod_seq_lock);
+
+       return seq;
+}
+
+/*
+ * log operation op on the key pointer at slot of eb, allocating the log
+ * element with the given gfp flags.
+ *
+ * the slot's pointer generation is always recorded. its key and block pointer
+ * are recorded for every op except MOD_LOG_KEY_ADD.
+ *
+ * returns 0 if nothing had to be logged or the entry was inserted, and a
+ * negative value if tree_mod_alloc or __tree_mod_log_insert failed.
+ */
+static noinline int
+tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
+                            struct extent_buffer *eb, int slot,
+                            enum mod_log_op op, gfp_t flags)
+{
+       struct tree_mod_elem *tm;
+       int seq = tree_mod_alloc(fs_info, flags, &tm);
+
+       if (seq <= 0)
+               return seq;
+
+       tm->op = op;
+       tm->slot = slot;
+       tm->index = eb->start >> PAGE_CACHE_SHIFT;
+       tm->generation = btrfs_node_ptr_generation(eb, slot);
+       if (op != MOD_LOG_KEY_ADD) {
+               btrfs_node_key(eb, &tm->key, slot);
+               tm->blockptr = btrfs_node_blockptr(eb, slot);
+       }
+
+       return __tree_mod_log_insert(fs_info, tm);
+}
+
+/*
+ * same as tree_mod_log_insert_key_mask, with the log element allocated
+ * using GFP_NOFS.
+ */
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                       int slot, enum mod_log_op op)
+{
+       return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
+}
+
+/*
+ * log a move of nr_items key pointers from src_slot to dst_slot within eb.
+ *
+ * the slots starting at dst_slot that lie below src_slot (at most nr_items
+ * of them) are logged as MOD_LOG_KEY_REMOVE_WHILE_MOVING first, then a single
+ * MOD_LOG_MOVE_KEYS entry is added for the move itself.
+ *
+ * returns 0 if nothing had to be logged or the entry was inserted, and a
+ * negative value if allocating or inserting the move entry failed. a failure
+ * to log one of the removals is a BUG.
+ */
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *eb, int dst_slot, int src_slot,
+                        int nr_items, gfp_t flags)
+{
+       struct tree_mod_elem *tm;
+       int ret;
+       int i = 0;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return 0;
+
+       while (i < nr_items && dst_slot + i < src_slot) {
+               ret = tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+                                             MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+               BUG_ON(ret < 0);
+               i++;
+       }
+
+       ret = tree_mod_alloc(fs_info, flags, &tm);
+       if (ret <= 0)
+               return ret;
+
+       tm->op = MOD_LOG_MOVE_KEYS;
+       tm->index = eb->start >> PAGE_CACHE_SHIFT;
+       tm->slot = src_slot;
+       tm->move.nr_items = nr_items;
+       tm->move.dst_slot = dst_slot;
+
+       return __tree_mod_log_insert(fs_info, tm);
+}
+
+/*
+ * log the replacement of old_root by new_root. the entry is indexed by the
+ * shifted logical of new_root and remembers the logical address, level and
+ * header generation of old_root.
+ *
+ * returns 0 if nothing had to be logged or the entry was inserted, and a
+ * negative value if tree_mod_alloc or __tree_mod_log_insert failed.
+ */
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+                        struct extent_buffer *old_root,
+                        struct extent_buffer *new_root, gfp_t flags)
+{
+       struct tree_mod_elem *tm;
+       int seq = tree_mod_alloc(fs_info, flags, &tm);
+
+       if (seq <= 0)
+               return seq;
+
+       tm->op = MOD_LOG_ROOT_REPLACE;
+       tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+       tm->generation = btrfs_header_generation(old_root);
+       tm->old_root.level = btrfs_header_level(old_root);
+       tm->old_root.logical = old_root->start;
+
+       return __tree_mod_log_insert(fs_info, tm);
+}
+
+/*
+ * look up a log entry for the block at logical address start, under the read
+ * side of tree_mod_log_lock. entries with a sequence number below min_seq are
+ * never returned.
+ *
+ * with smallest == 0 the search keeps descending to the left after each
+ * candidate and ends up with the highest sequence number. with smallest != 0
+ * it descends to the right after each candidate above min_seq and ends up
+ * with the lowest one; an entry with exactly min_seq ends the search.
+ *
+ * returns NULL if no matching entry exists.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+                     int smallest)
+{
+       struct rb_root *tm_root;
+       struct rb_node *node;
+       struct tree_mod_elem *cur = NULL;
+       struct tree_mod_elem *found = NULL;
+       u64 index = start >> PAGE_CACHE_SHIFT;
+
+       read_lock(&fs_info->tree_mod_log_lock);
+       tm_root = &fs_info->tree_mod_log;
+       node = tm_root->rb_node;
+       while (node) {
+               cur = container_of(node, struct tree_mod_elem, node);
+               if (cur->index < index) {
+                       node = node->rb_left;
+               } else if (cur->index > index) {
+                       node = node->rb_right;
+               } else if (cur->elem.seq < min_seq) {
+                       /* right block, but too old: skip it */
+                       node = node->rb_left;
+               } else if (!smallest) {
+                       /* we want the node with the highest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq > cur->elem.seq);
+                       found = cur;
+                       node = node->rb_left;
+               } else if (cur->elem.seq > min_seq) {
+                       /* we want the node with the smallest seq */
+                       if (found)
+                               BUG_ON(found->elem.seq < cur->elem.seq);
+                       found = cur;
+                       node = node->rb_right;
+               } else {
+                       /* seq == min_seq, nothing smaller is acceptable */
+                       found = cur;
+                       break;
+               }
+       }
+       read_unlock(&fs_info->tree_mod_log_lock);
+
+       return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log for the block at start (the oldest log item). any
+ * element with a time sequence lower than min_seq will be ignored. returns
+ * NULL if there is no such element.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+                          u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log for the block at start (the most recent log item).
+ * any element with a time sequence lower than min_seq will be ignored.
+ * returns NULL if there is no such element.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+       return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+/*
+ * log a copy of nr_items key pointers from src (starting at slot src_offset)
+ * to dst (starting at slot dst_offset). every copied pointer is logged as a
+ * MOD_LOG_KEY_REMOVE on src followed by a MOD_LOG_KEY_ADD on dst.
+ *
+ * nothing is logged if the log is not in use or if both buffers are level 0
+ * blocks. a failure to log is a BUG.
+ */
+static inline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    struct extent_buffer *src, unsigned long dst_offset,
+                    unsigned long src_offset, int nr_items)
+{
+       int i = 0;
+       int ret;
+
+       if (tree_mod_dont_log(fs_info, NULL))
+               return;
+
+       if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+               return;
+
+       /* speed this up by single seq for all operations? */
+       while (i < nr_items) {
+               ret = tree_mod_log_insert_key(fs_info, src, i + src_offset,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(ret < 0);
+               ret = tree_mod_log_insert_key(fs_info, dst, i + dst_offset,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(ret < 0);
+               i++;
+       }
+}
+
+/*
+ * log a move of nr_items key pointers inside dst, from slot src_offset to
+ * slot dst_offset. the log element is allocated with GFP_NOFS; a failure to
+ * log is a BUG.
+ */
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+                    int dst_offset, int src_offset, int nr_items)
+{
+       int ret = tree_mod_log_insert_move(fs_info, dst, dst_offset,
+                                          src_offset, nr_items, GFP_NOFS);
+
+       BUG_ON(ret < 0);
+}
+
+/*
+ * log a MOD_LOG_KEY_REPLACE for the key pointer at slot of eb. the log
+ * element is allocated with GFP_ATOMIC if atomic is set and with GFP_NOFS
+ * otherwise. a failure to log is a BUG.
+ *
+ * disk_key is not looked at here; the logged key is read from eb.
+ */
+static inline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+                         struct extent_buffer *eb,
+                         struct btrfs_disk_key *disk_key, int slot, int atomic)
+{
+       gfp_t flags = atomic ? GFP_ATOMIC : GFP_NOFS;
+       int ret;
+
+       ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
+                                          MOD_LOG_KEY_REPLACE, flags);
+       BUG_ON(ret < 0);
+}
+
+/*
+ * log the removal of every key pointer of eb as
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING, walking from the last slot down to slot
+ * 0, so the entry for slot 0 is the last one logged.
+ *
+ * nothing is logged if the log is not in use or eb is a level 0 block. a
+ * failure to log is a BUG.
+ */
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+                                struct extent_buffer *eb)
+{
+       u32 nritems;
+       int ret;
+       int i;
+
+       if (tree_mod_dont_log(fs_info, eb))
+               return;
+
+       nritems = btrfs_header_nritems(eb);
+       i = nritems - 1;
+       while (i >= 0) {
+               ret = tree_mod_log_insert_key(fs_info, eb, i,
+                                             MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+               BUG_ON(ret < 0);
+               i--;
+       }
+}
+
+/*
+ * log that root->node is about to be replaced by new_root_node: first the
+ * removal of all key pointers of the current root node, then the root
+ * replacement itself. a failure to log the replacement is a BUG.
+ */
+static inline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+                             struct extent_buffer *new_root_node)
+{
+       struct btrfs_fs_info *fs_info = root->fs_info;
+       int ret;
+
+       tree_mod_log_free_eb(fs_info, root->node);
+       ret = tree_mod_log_insert_root(fs_info, root->node, new_root_node,
+                                      GFP_NOFS);
+       BUG_ON(ret < 0);
+}
+
  /*
   * check if the tree block can be shared by multiple trees
   */
@@ -409,6 +847,12 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                         ret = btrfs_dec_ref(trans, root, buf, 1, 1);
                         BUG_ON(ret); /* -ENOMEM */
                 }
+               /*
+                * don't log freeing in case we're freeing the root node, this
+                * is done by tree_mod_log_set_root_pointer later
+                */
+               if (buf != root->node && btrfs_header_level(buf) != 0)
+                       tree_mod_log_free_eb(root->fs_info, buf);
                 clean_tree_block(trans, root, buf);
                 *last_ref = 1;
         }
@@ -467,7 +911,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
  
         cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
                                      root->root_key.objectid, &disk_key,
-                                    level, search_start, empty_size, 1);
+                                    level, search_start, empty_size);
         if (IS_ERR(cow))
                 return PTR_ERR(cow);
  
@@ -506,10 +950,11 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                         parent_start = 0;
  
                 extent_buffer_get(cow);
+               tree_mod_log_set_root_pointer(root, cow);
                 rcu_assign_pointer(root->node, cow);
  
                 btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
                 free_extent_buffer(buf);
                 add_root_to_dirty_list(root);
         } else {
@@ -519,13 +964,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                         parent_start = 0;
  
                 WARN_ON(trans->transid != btrfs_header_generation(parent));
+               tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+                                       MOD_LOG_KEY_REPLACE);
                 btrfs_set_node_blockptr(parent, parent_slot,
                                         cow->start);
                 btrfs_set_node_ptr_generation(parent, parent_slot,
                                               trans->transid);
                 btrfs_mark_buffer_dirty(parent);
                 btrfs_free_tree_block(trans, root, buf, parent_start,
-                                     last_ref, 1);
+                                     last_ref);
         }
         if (unlock_orig)
                 btrfs_tree_unlock(buf);
@@ -535,6 +982,210 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
         return 0;
  }
  
+/*
+ * returns the MOD_LOG_ROOT_REPLACE log element that describes the oldest
+ * predecessor of the given root; the predecessor's logical address is found
+ * in its old_root member. entries older than time_seq are ignored.
+ *
+ * returns NULL if time_seq is 0, if nothing is logged for the current root
+ * node, or if the oldest entry logged for it is not a root replacement.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+                          struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct tree_mod_elem *found = NULL;
+       u64 root_logical = root->node->start;
+       int looped = 0;
+
+       if (!time_seq)
+               return NULL;
+
+       /*
+        * the very last operation that's logged for a root is the replacement
+        * operation (if it is replaced at all). this has the index of the *new*
+        * root, making it the very first operation that's logged for this root.
+        */
+       while (1) {
+               tm = tree_mod_log_search_oldest(fs_info, root_logical,
+                                               time_seq);
+               if (!looped && !tm)
+                       return NULL;
+               /*
+                * we must have key remove operations in the log before the
+                * replace operation.
+                */
+               BUG_ON(!tm);
+
+               if (tm->op != MOD_LOG_ROOT_REPLACE)
+                       break;
+
+               /* follow the chain of replacements one root further back */
+               found = tm;
+               root_logical = tm->old_root.logical;
+               BUG_ON(root_logical == root->node->start);
+               looped = 1;
+       }
+
+       return found;
+}
+
+/*
+ * first_tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq, the end of the log, or an entry that belongs to another block).
+ *
+ * eb is modified in place; its nritems header field is updated at the end.
+ */
+static void
+__tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
+                     struct tree_mod_elem *first_tm)
+{
+       u32 n;
+       struct rb_node *next;
+       struct tree_mod_elem *tm = first_tm;
+       unsigned long o_dst;
+       unsigned long o_src;
+       unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+       /* n tracks the number of items while operations are undone */
+       n = btrfs_header_nritems(eb);
+       while (tm && tm->elem.seq >= time_seq) {
+               /*
+                * all the operations are recorded with the operator used for
+                * the modification. as we're going backwards, we do the
+                * opposite of each operation here.
+                */
+               switch (tm->op) {
+               case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+                       BUG_ON(tm->slot < n);
+                       /* fall through */
+               case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+               case MOD_LOG_KEY_REMOVE:
+                       /* undo a removal: put the logged pointer back */
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       n++;
+                       break;
+               case MOD_LOG_KEY_REPLACE:
+                       /* undo a replacement: restore the logged pointer */
+                       BUG_ON(tm->slot >= n);
+                       btrfs_set_node_key(eb, &tm->key, tm->slot);
+                       btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+                       btrfs_set_node_ptr_generation(eb, tm->slot,
+                                                     tm->generation);
+                       break;
+               case MOD_LOG_KEY_ADD:
+                       /* undo an addition: drop the pointer at tm->slot */
+                       if (tm->slot != n - 1) {
+                               o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                               o_src = btrfs_node_key_ptr_offset(tm->slot + 1);
+                               memmove_extent_buffer(eb, o_dst, o_src, p_size);
+                       }
+                       n--;
+                       break;
+               case MOD_LOG_MOVE_KEYS:
+                       /* undo a move: copy from dst_slot back to the source */
+                       o_dst = btrfs_node_key_ptr_offset(tm->slot);
+                       o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+                       memmove_extent_buffer(eb, o_dst, o_src,
+                                             tm->move.nr_items * p_size);
+                       break;
+               case MOD_LOG_ROOT_REPLACE:
+                       /*
+                        * this operation is special. for roots, this must be
+                        * handled explicitly before rewinding.
+                        * for non-roots, this operation may exist if the node
+                        * was a root: root A -> child B; then A gets empty and
+                        * B is promoted to the new root. in the mod log, we'll
+                        * have a root-replace operation for B, a tree block
+                        * that is no root. we simply ignore that operation.
+                        */
+                       break;
+               }
+               next = rb_next(&tm->node);
+               if (!next)
+                       break;
+               tm = container_of(next, struct tree_mod_elem, node);
+               /* stop as soon as the log entries of another block begin */
+               if (tm->index != first_tm->index)
+                       break;
+       }
+       btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * return a version of eb as it looked at time_seq.
+ *
+ * eb itself is returned unchanged if time_seq is 0, if eb is a level 0 block
+ * or if the log has no entry for eb at or after time_seq. otherwise a private
+ * buffer is built and rewound: a dummy buffer carrying eb's bytenr, backref
+ * revision, owner and level if the most recent logged operation is a
+ * MOD_LOG_KEY_REMOVE_WHILE_FREEING, a clone of eb in all other cases.
+ *
+ * when a private buffer is returned, extent_buffer_get() has been called on
+ * it and free_extent_buffer() has been called on eb. allocation failures are
+ * a BUG.
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+                   u64 time_seq)
+{
+       struct extent_buffer *eb_rewin;
+       struct tree_mod_elem *tm;
+
+       if (!time_seq)
+               return eb;
+
+       if (btrfs_header_level(eb) == 0)
+               return eb;
+
+       /* most recent log entry for this block */
+       tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+       if (!tm)
+               return eb;
+
+       if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+               /* tree_mod_log_free_eb logs slot 0 last */
+               BUG_ON(tm->slot != 0);
+               eb_rewin = alloc_dummy_extent_buffer(eb->start,
+                                               fs_info->tree_root->nodesize);
+               BUG_ON(!eb_rewin);
+               btrfs_set_header_bytenr(eb_rewin, eb->start);
+               btrfs_set_header_backref_rev(eb_rewin,
+                                            btrfs_header_backref_rev(eb));
+               btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+               btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+       } else {
+               eb_rewin = btrfs_clone_extent_buffer(eb);
+               BUG_ON(!eb_rewin);
+       }
+
+       extent_buffer_get(eb_rewin);
+       free_extent_buffer(eb);
+
+       __tree_mod_log_rewind(eb_rewin, time_seq, tm);
+
+       return eb_rewin;
+}
+
+/*
+ * return the root node of root as it looked at time_seq.
+ *
+ * if the log holds no root replacement to go back to, root->node itself is
+ * returned. otherwise a private buffer is set up (a clone of the current
+ * root node, or a dummy buffer if the old root was a different block), gets
+ * the old root's level and generation and is rewound to time_seq.
+ *
+ * returns NULL if the private buffer cannot be allocated.
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+       struct tree_mod_elem *tm;
+       struct extent_buffer *eb;
+       struct tree_mod_root *old_root;
+       u64 old_generation;
+
+       tm = __tree_mod_log_oldest_root(root->fs_info, root, time_seq);
+       if (!tm)
+               return root->node;
+
+       old_root = &tm->old_root;
+       old_generation = tm->generation;
+
+       tm = tree_mod_log_search(root->fs_info, old_root->logical, time_seq);
+       /*
+        * there was an item in the log when __tree_mod_log_oldest_root
+        * returned. this one must not go away, because the time_seq passed to
+        * us must be blocking its removal.
+        */
+       BUG_ON(!tm);
+
+       if (old_root->logical == root->node->start) {
+               /* there are logged operations for the current root */
+               eb = btrfs_clone_extent_buffer(root->node);
+               if (!eb)
+                       return NULL;
+       } else {
+               /* there's a root replace operation for the current root */
+               eb = alloc_dummy_extent_buffer(tm->index << PAGE_CACHE_SHIFT,
+                                              root->nodesize);
+               /* check before the header setters below dereference eb */
+               if (!eb)
+                       return NULL;
+               btrfs_set_header_bytenr(eb, eb->start);
+               btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+               btrfs_set_header_owner(eb, root->root_key.objectid);
+       }
+       btrfs_set_header_level(eb, old_root->level);
+       btrfs_set_header_generation(eb, old_generation);
+       __tree_mod_log_rewind(eb, time_seq, tm);
+
+       return eb;
+}
+
  static inline int should_cow_block(struct btrfs_trans_handle *trans,
                                    struct btrfs_root *root,
                                    struct extent_buffer *buf)
@@ -739,7 +1390,11 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                                 if (!cur)
                                         return -EIO;
                         } else if (!uptodate) {
-                               btrfs_read_buffer(cur, gen);
+                               err = btrfs_read_buffer(cur, gen);
+                               if (err) {
+                                       free_extent_buffer(cur);
+                                       return err;
+                               }
                         }
                 }
                 if (search_start == 0)
@@ -854,20 +1509,18 @@ static noinline int generic_bin_search(struct extent_buffer *eb,
  static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                       int level, int *slot)
  {
-       if (level == 0) {
+       if (level == 0)
                 return generic_bin_search(eb,
                                           offsetof(struct btrfs_leaf, items),
                                           sizeof(struct btrfs_item),
                                           key, btrfs_header_nritems(eb),
                                           slot);
-       } else {
+       else
                 return generic_bin_search(eb,
                                           offsetof(struct btrfs_node, ptrs),
                                           sizeof(struct btrfs_key_ptr),
                                           key, btrfs_header_nritems(eb),
                                           slot);
-       }
-       return -1;
  }
  
  int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -974,6 +1627,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                         goto enospc;
                 }
  
+               tree_mod_log_set_root_pointer(root, child);
                 rcu_assign_pointer(root->node, child);
  
                 add_root_to_dirty_list(root);
@@ -987,7 +1641,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 free_extent_buffer(mid);
  
                 root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                 /* once for the root ptr */
                 free_extent_buffer_stale(mid);
                 return 0;
@@ -1040,14 +1694,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                 if (btrfs_header_nritems(right) == 0) {
                         clean_tree_block(trans, root, right);
                         btrfs_tree_unlock(right);
-                       del_ptr(trans, root, path, level + 1, pslot + 1);
+                       del_ptr(trans, root, path, level + 1, pslot + 1, 1);
                         root_sub_used(root, right->len);
-                       btrfs_free_tree_block(trans, root, right, 0, 1, 0);
+                       btrfs_free_tree_block(trans, root, right, 0, 1);
                         free_extent_buffer_stale(right);
                         right = NULL;
                 } else {
                         struct btrfs_disk_key right_key;
                         btrfs_node_key(right, &right_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &right_key, pslot + 1, 0);
                         btrfs_set_node_key(parent, &right_key, pslot + 1);
                         btrfs_mark_buffer_dirty(parent);
                 }
@@ -1082,15 +1738,17 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
         if (btrfs_header_nritems(mid) == 0) {
                 clean_tree_block(trans, root, mid);
                 btrfs_tree_unlock(mid);
-               del_ptr(trans, root, path, level + 1, pslot);
+               del_ptr(trans, root, path, level + 1, pslot, 1);
                 root_sub_used(root, mid->len);
-               btrfs_free_tree_block(trans, root, mid, 0, 1, 0);
+               btrfs_free_tree_block(trans, root, mid, 0, 1);
                 free_extent_buffer_stale(mid);
                 mid = NULL;
         } else {
                 /* update the parent key to reflect our changes */
                 struct btrfs_disk_key mid_key;
                 btrfs_node_key(mid, &mid_key, 0);
+               tree_mod_log_set_node_key(root->fs_info, parent, &mid_key,
+                                         pslot, 0);
                 btrfs_set_node_key(parent, &mid_key, pslot);
                 btrfs_mark_buffer_dirty(parent);
         }
@@ -1188,6 +1846,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                         struct btrfs_disk_key disk_key;
                         orig_slot += left_nr;
                         btrfs_node_key(mid, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot, 0);
                         btrfs_set_node_key(parent, &disk_key, pslot);
                         btrfs_mark_buffer_dirty(parent);
                         if (btrfs_header_nritems(left) > orig_slot) {
@@ -1239,6 +1899,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                         struct btrfs_disk_key disk_key;
  
                         btrfs_node_key(right, &disk_key, 0);
+                       tree_mod_log_set_node_key(root->fs_info, parent,
+                                                 &disk_key, pslot + 1, 0);
                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
                         btrfs_mark_buffer_dirty(parent);
  
@@ -1496,7 +2158,7 @@ static int
  read_block_for_search(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *p,
                        struct extent_buffer **eb_ret, int level, int slot,
-                      struct btrfs_key *key)
+                      struct btrfs_key *key, u64 time_seq)
  {
         u64 blocknr;
         u64 gen;
@@ -1850,7 +2512,7 @@ cow_done:
                         }
  
                         err = read_block_for_search(trans, root, p,
-                                                   &b, level, slot, key);
+                                                   &b, level, slot, key, 0);
                         if (err == -EAGAIN)
                                 goto again;
                         if (err) {
@@ -1921,6 +2583,115 @@ done:
         return ret;
  }
  
+/*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ *
+ * On a negative return value the path has been released. -ENOMEM is returned
+ * when get_old_root() cannot provide a buffer for the old root.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq)
+{
+       struct extent_buffer *b;
+       int slot;
+       int ret;
+       int err;
+       int level;
+       int lowest_unlock = 1;
+       u8 lowest_level = p->lowest_level;
+
+       WARN_ON(p->nodes[0] != NULL);
+
+       if (p->search_commit_root) {
+               /* the commit root has no history to rewind to */
+               BUG_ON(time_seq);
+               return btrfs_search_slot(NULL, root, key, p, 0, 0);
+       }
+
+again:
+       b = get_old_root(root, time_seq);
+       if (!b) {
+               /*
+                * get_old_root() returns NULL when it could not get a buffer
+                * to rewind into; do not dereference it below.
+                */
+               ret = -ENOMEM;
+               goto done;
+       }
+       extent_buffer_get(b);
+       level = btrfs_header_level(b);
+       btrfs_tree_read_lock(b);
+       p->locks[level] = BTRFS_READ_LOCK;
+
+       while (b) {
+               level = btrfs_header_level(b);
+               p->nodes[level] = b;
+               btrfs_clear_path_blocking(p, NULL, 0);
+
+               /*
+                * we have a lock on b and as long as we aren't changing
+                * the tree, there is no way for the items in b to change.
+                * It is safe to drop the lock on our parent before we
+                * go through the expensive btree search on b.
+                */
+               btrfs_unlock_up_safe(p, level + 1);
+
+               ret = bin_search(b, key, level, &slot);
+
+               if (level != 0) {
+                       int dec = 0;
+                       /*
+                        * no exact match in a node: step back one slot so we
+                        * descend into the child that can contain the key.
+                        */
+                       if (ret && slot > 0) {
+                               dec = 1;
+                               slot -= 1;
+                       }
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+
+                       if (level == lowest_level) {
+                               /* undo the step back before reporting the slot */
+                               if (dec)
+                                       p->slots[level]++;
+                               goto done;
+                       }
+
+                       err = read_block_for_search(NULL, root, p, &b, level,
+                                                   slot, key, time_seq);
+                       if (err == -EAGAIN)
+                               goto again;
+                       if (err) {
+                               ret = err;
+                               goto done;
+                       }
+
+                       level = btrfs_header_level(b);
+                       err = btrfs_try_tree_read_lock(b);
+                       if (!err) {
+                               btrfs_set_path_blocking(p);
+                               btrfs_tree_read_lock(b);
+                               btrfs_clear_path_blocking(p, b,
+                                                         BTRFS_READ_LOCK);
+                       }
+                       p->locks[level] = BTRFS_READ_LOCK;
+                       p->nodes[level] = b;
+                       b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+                       if (b != p->nodes[level]) {
+                               /*
+                                * the rewind handed back a different buffer:
+                                * drop the lock taken on the one we read and
+                                * track the returned buffer without a lock.
+                                */
+                               btrfs_tree_unlock_rw(p->nodes[level],
+                                                    p->locks[level]);
+                               p->locks[level] = 0;
+                               p->nodes[level] = b;
+                       }
+               } else {
+                       p->slots[level] = slot;
+                       unlock_up(p, level, lowest_unlock, 0, NULL);
+                       goto done;
+               }
+       }
+       ret = 1;
+done:
+       if (!p->leave_spinning)
+               btrfs_set_path_blocking(p);
+       if (ret < 0)
+               btrfs_release_path(p);
+
+       return ret;
+}
+
  /*
   * adjust the pointers going up the tree, starting at level
   * making sure the right key of each node points to 'key'.
@@ -1941,6 +2712,7 @@ static void fixup_low_keys(struct btrfs_trans_handle *trans,
                 if (!path->nodes[i])
                         break;
                 t = path->nodes[i];
+               tree_mod_log_set_node_key(root->fs_info, t, key, tslot, 1);
                 btrfs_set_node_key(t, key, tslot);
                 btrfs_mark_buffer_dirty(path->nodes[i]);
                 if (tslot != 0)
@@ -2023,12 +2795,16 @@ static int push_node_left(struct btrfs_trans_handle *trans,
         } else
                 push_items = min(src_nritems - 8, push_items);
  
+       tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+                            push_items);
         copy_extent_buffer(dst, src,
                            btrfs_node_key_ptr_offset(dst_nritems),
                            btrfs_node_key_ptr_offset(0),
                            push_items * sizeof(struct btrfs_key_ptr));
  
         if (push_items < src_nritems) {
+               tree_mod_log_eb_move(root->fs_info, src, 0, push_items,
+                                    src_nritems - push_items);
                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
                                       btrfs_node_key_ptr_offset(push_items),
                                       (src_nritems - push_items) *
@@ -2082,11 +2858,14 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
         if (max_push < push_items)
                 push_items = max_push;
  
+       tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
                                       btrfs_node_key_ptr_offset(0),
                                       (dst_nritems) *
                                       sizeof(struct btrfs_key_ptr));
  
+       tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+                            src_nritems - push_items, push_items);
         copy_extent_buffer(dst, src,
                            btrfs_node_key_ptr_offset(0),
                            btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2129,7 +2908,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
  
         c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                    root->root_key.objectid, &lower_key,
-                                  level, root->node->start, 0, 0);
+                                  level, root->node->start, 0);
         if (IS_ERR(c))
                 return PTR_ERR(c);
  
@@ -2161,6 +2940,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
         btrfs_mark_buffer_dirty(c);
  
         old = root->node;
+       tree_mod_log_set_root_pointer(root, c);
         rcu_assign_pointer(root->node, c);
  
         /* the super has an extra ref to root->node */
@@ -2184,10 +2964,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
  static void insert_ptr(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct btrfs_path *path,
                        struct btrfs_disk_key *key, u64 bytenr,
-                      int slot, int level)
+                      int slot, int level, int tree_mod_log)
  {
         struct extent_buffer *lower;
         int nritems;
+       int ret;
  
         BUG_ON(!path->nodes[level]);
         btrfs_assert_tree_locked(path->nodes[level]);
@@ -2196,11 +2977,19 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
         BUG_ON(slot > nritems);
         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
         if (slot != nritems) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+                                            slot, nritems - slot);
                 memmove_extent_buffer(lower,
                               btrfs_node_key_ptr_offset(slot + 1),
                               btrfs_node_key_ptr_offset(slot),
                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
         }
+       if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+                                             MOD_LOG_KEY_ADD);
+               BUG_ON(ret < 0);
+       }
         btrfs_set_node_key(lower, key, slot);
         btrfs_set_node_blockptr(lower, slot, bytenr);
         WARN_ON(trans->transid == 0);
@@ -2252,7 +3041,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
  
         split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
                                         root->root_key.objectid,
-                                       &disk_key, level, c->start, 0, 0);
+                                       &disk_key, level, c->start, 0);
         if (IS_ERR(split))
                 return PTR_ERR(split);
  
@@ -2271,7 +3060,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                             (unsigned long)btrfs_header_chunk_tree_uuid(split),
                             BTRFS_UUID_SIZE);
  
-
+       tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
         copy_extent_buffer(split, c,
                            btrfs_node_key_ptr_offset(0),
                            btrfs_node_key_ptr_offset(mid),
@@ -2284,7 +3073,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
         btrfs_mark_buffer_dirty(split);
  
         insert_ptr(trans, root, path, &disk_key, split->start,
-                  path->slots[level + 1] + 1, level + 1);
+                  path->slots[level + 1] + 1, level + 1, 1);
  
         if (path->slots[level] >= mid) {
                 path->slots[level] -= mid;
@@ -2821,7 +3610,7 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
         btrfs_set_header_nritems(l, mid);
         btrfs_item_key(right, &disk_key, 0);
         insert_ptr(trans, root, path, &disk_key, right->start,
-                  path->slots[1] + 1, 1);
+                  path->slots[1] + 1, 1, 0);
  
         btrfs_mark_buffer_dirty(right);
         btrfs_mark_buffer_dirty(l);
@@ -3004,7 +3793,7 @@ again:
  
         right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                         root->root_key.objectid,
-                                       &disk_key, 0, l->start, 0, 0);
+                                       &disk_key, 0, l->start, 0);
         if (IS_ERR(right))
                 return PTR_ERR(right);
  
@@ -3028,7 +3817,7 @@ again:
                 if (mid <= slot) {
                         btrfs_set_header_nritems(right, 0);
                         insert_ptr(trans, root, path, &disk_key, right->start,
-                                  path->slots[1] + 1, 1);
+                                  path->slots[1] + 1, 1, 0);
                         btrfs_tree_unlock(path->nodes[0]);
                         free_extent_buffer(path->nodes[0]);
                         path->nodes[0] = right;
@@ -3037,7 +3826,7 @@ again:
                 } else {
                         btrfs_set_header_nritems(right, 0);
                         insert_ptr(trans, root, path, &disk_key, right->start,
-                                         path->slots[1], 1);
+                                         path->slots[1], 1, 0);
                         btrfs_tree_unlock(path->nodes[0]);
                         free_extent_buffer(path->nodes[0]);
                         path->nodes[0] = right;
@@ -3749,19 +4538,29 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
   * empty a node.
   */
  static void del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-                   struct btrfs_path *path, int level, int slot)
+                   struct btrfs_path *path, int level, int slot,
+                   int tree_mod_log)
  {
         struct extent_buffer *parent = path->nodes[level];
         u32 nritems;
+       int ret;
  
         nritems = btrfs_header_nritems(parent);
         if (slot != nritems - 1) {
+               if (tree_mod_log && level)
+                       tree_mod_log_eb_move(root->fs_info, parent, slot,
+                                            slot + 1, nritems - slot - 1);
                 memmove_extent_buffer(parent,
                               btrfs_node_key_ptr_offset(slot),
                               btrfs_node_key_ptr_offset(slot + 1),
                               sizeof(struct btrfs_key_ptr) *
                               (nritems - slot - 1));
+       } else if (tree_mod_log && level) {
+               ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+                                             MOD_LOG_KEY_REMOVE);
+               BUG_ON(ret < 0);
         }
+
         nritems--;
         btrfs_set_header_nritems(parent, nritems);
         if (nritems == 0 && parent == root->node) {
@@ -3793,7 +4592,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
                                     struct extent_buffer *leaf)
  {
         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-       del_ptr(trans, root, path, 1, path->slots[1]);
+       del_ptr(trans, root, path, 1, path->slots[1], 1);
  
         /*
          * btrfs_free_extent is expensive, we want to make sure we
@@ -3804,7 +4603,7 @@ static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
         root_sub_used(root, leaf->len);
  
         extent_buffer_get(leaf);
-       btrfs_free_tree_block(trans, root, leaf, 0, 1, 0);
+       btrfs_free_tree_block(trans, root, leaf, 0, 1);
         free_extent_buffer_stale(leaf);
  }
  /*
@@ -4271,7 +5070,7 @@ again:
                 next = c;
                 next_rw_lock = path->locks[level];
                 ret = read_block_for_search(NULL, root, path, &next, level,
-                                           slot, &key);
+                                           slot, &key, 0);
                 if (ret == -EAGAIN)
                         goto again;
  
@@ -4308,7 +5107,7 @@ again:
                         break;
  
                 ret = read_block_for_search(NULL, root, path, &next, level,
-                                           0, &key);
+                                           0, &key, 0);
                 if (ret == -EAGAIN)
                         goto again;
  
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h

index 8fd72331d6008c100e48db1c808566eb382187b2..0236d03c6732569a48a561049ea5a861d473da65 100644 (file)
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -173,6 +173,9 @@ static int btrfs_csum_sizes[] = { 4, 0 };
  #define BTRFS_FT_XATTR         8
  #define BTRFS_FT_MAX           9
  
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
  /*
   * The key defines the order in the tree, and so it also defines (optimal)
   * block layout.
@@ -823,6 +826,14 @@ struct btrfs_csum_item {
         u8 csum;
  } __attribute__ ((__packed__));
  
+/*
+ * Packed item holding an array of BTRFS_DEV_STAT_VALUES_MAX little-endian
+ * 64-bit values.
+ * NOTE(review): presumably the payload stored under BTRFS_DEV_STATS_KEY;
+ * confirm against the code that writes it.
+ */
+struct btrfs_dev_stats_item {
+       /*
+        * grow this item struct at the end for future enhancements and keep
+        * the existing values unchanged
+        */
+       __le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
  /* different types of block groups (and chunks) */
  #define BTRFS_BLOCK_GROUP_DATA         (1ULL << 0)
  #define BTRFS_BLOCK_GROUP_SYSTEM       (1ULL << 1)
@@ -1129,6 +1140,15 @@ struct btrfs_fs_info {
         spinlock_t delayed_iput_lock;
         struct list_head delayed_iputs;
  
+       /* this protects tree_mod_seq_list */
+       spinlock_t tree_mod_seq_lock;
+       atomic_t tree_mod_seq;
+       struct list_head tree_mod_seq_list;
+
+       /* this protects tree_mod_log */
+       rwlock_t tree_mod_log_lock;
+       struct rb_root tree_mod_log;
+
         atomic_t nr_async_submits;
         atomic_t async_submit_draining;
         atomic_t nr_async_bios;
@@ -1375,7 +1395,7 @@ struct btrfs_root {
         struct list_head root_list;
  
         spinlock_t orphan_lock;
-       struct list_head orphan_list;
+       atomic_t orphan_inodes;
         struct btrfs_block_rsv *orphan_block_rsv;
         int orphan_item_inserted;
         int orphan_cleanup_state;
@@ -1507,6 +1527,12 @@ struct btrfs_ioctl_defrag_range_args {
  
  #define BTRFS_BALANCE_ITEM_KEY 248
  
+/*
+ * Persistently stores the io stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY    249
+
  /*
   * string items are for debugging.  They just store a short string of
   * data in the FS
@@ -2415,6 +2441,30 @@ static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb,
         return btrfs_item_size(eb, e) - offset;
  }
  
+/*
+ * btrfs_dev_stats_item: read the value at slot @index of the values[] array
+ * of the item located at offset @ptr inside @eb.
+ * NOTE(review): the bytes are copied verbatim into a u64; no byte-order
+ * conversion is visible here even though values[] is declared __le64.
+ */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+                                       struct btrfs_dev_stats_item *ptr,
+                                       int index)
+{
+       unsigned long pos;
+       u64 result;
+
+       /* item offset within eb + start of values[] + offset of the slot */
+       pos = offsetof(struct btrfs_dev_stats_item, values) +
+             ((unsigned long)ptr) + (index * sizeof(u64));
+       read_extent_buffer(eb, &result, pos, sizeof(result));
+
+       return result;
+}
+
+/*
+ * Store @val into slot @index of the values[] array of the
+ * btrfs_dev_stats_item located at offset @ptr inside @eb.
+ */
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+                                            struct btrfs_dev_stats_item *ptr,
+                                            int index, u64 val)
+{
+       unsigned long pos;
+
+       /* item offset within eb + start of values[] + offset of the slot */
+       pos = offsetof(struct btrfs_dev_stats_item, values) +
+             ((unsigned long)ptr) + (index * sizeof(u64));
+       write_extent_buffer(eb, &val, pos, sizeof(val));
+}
+
  static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
  {
         return sb->s_fs_info;
@@ -2496,11 +2546,11 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u32 blocksize,
                                         u64 parent, u64 root_objectid,
                                         struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow);
+                                       u64 hint, u64 empty_size);
  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow);
+                          u64 parent, int last_ref);
  struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             u64 bytenr, u32 blocksize,
@@ -2659,6 +2709,8 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
  int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
                       *root, struct btrfs_key *key, struct btrfs_path *p, int
                       ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+                         struct btrfs_path *p, u64 time_seq);
  int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *parent,
                        int start_slot, int cache_only, u64 *last_ret,
@@ -2922,7 +2974,6 @@ int btrfs_readpage(struct file *file, struct page *page);
  void btrfs_evict_inode(struct inode *inode);
  int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
  int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
  struct inode *btrfs_alloc_inode(struct super_block *sb);
  void btrfs_destroy_inode(struct inode *inode);
  int btrfs_drop_inode(struct inode *inode);
@@ -3098,4 +3149,23 @@ void btrfs_reada_detach(void *handle);
  int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
                          u64 start, int err);
  
+/*
+ * delayed seq elem
+ *
+ * List element that carries a sequence number. This is the element type
+ * taken by btrfs_get_tree_mod_seq() and btrfs_put_tree_mod_seq().
+ */
+struct seq_list {
+       struct list_head list;
+       u64 seq;
+       u32 flags;
+};
+
+void btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+                           struct seq_list *elem);
+
+/*
+ * Returns 1 when @rootid is BTRFS_FS_TREE_OBJECTID or, compared as a signed
+ * value, is at least BTRFS_FIRST_FREE_OBJECTID; returns 0 otherwise.
+ */
+static inline int is_fstree(u64 rootid)
+{
+       return rootid == BTRFS_FS_TREE_OBJECTID ||
+              (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID;
+}
  #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c

index 03e3748d84d02407c19c6d46648667a56f13ba3e..c18d0442ae6daa69a564ebba400f9ad09573ea1d 100644 (file)
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -669,8 +669,8 @@ static int btrfs_delayed_inode_reserve_metadata(
                 return ret;
         } else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
                 spin_lock(&BTRFS_I(inode)->lock);
-               if (BTRFS_I(inode)->delalloc_meta_reserved) {
-                       BTRFS_I(inode)->delalloc_meta_reserved = 0;
+               if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                                      &BTRFS_I(inode)->runtime_flags)) {
                         spin_unlock(&BTRFS_I(inode)->lock);
                         release = true;
                         goto migrate;
@@ -1706,7 +1706,7 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
         btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
         btrfs_set_stack_inode_generation(inode_item,
                                          BTRFS_I(inode)->generation);
-       btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+       btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
         btrfs_set_stack_inode_transid(inode_item, trans->transid);
         btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
         btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1754,7 +1754,7 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
         set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
         inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
         BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
-       BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+       inode->i_version = btrfs_stack_inode_sequence(inode_item);
         inode->i_rdev = 0;
         *rdev = btrfs_stack_inode_rdev(inode_item);
         BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c

index 69f22e3ab3bc307974b5cae14f99310a498b54cf..13ae7b04790eaff72e8c23fb145fca8bfae88175 100644 (file)
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -525,7 +525,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
         ref->is_head = 0;
         ref->in_tree = 1;
  
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                 seq = inc_delayed_seq(delayed_refs);
         ref->seq = seq;
  
@@ -584,7 +584,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
         ref->is_head = 0;
         ref->in_tree = 1;
  
-       if (need_ref_seq(for_cow, ref_root))
+       if (is_fstree(ref_root))
                 seq = inc_delayed_seq(delayed_refs);
         ref->seq = seq;
  
@@ -658,10 +658,11 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
         add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
                                    num_bytes, parent, ref_root, level, action,
                                    for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
             waitqueue_active(&delayed_refs->seq_wait))
                 wake_up(&delayed_refs->seq_wait);
         spin_unlock(&delayed_refs->lock);
+
         return 0;
  }
  
@@ -706,10 +707,11 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
         add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
                                    num_bytes, parent, ref_root, owner, offset,
                                    action, for_cow);
-       if (!need_ref_seq(for_cow, ref_root) &&
+       if (!is_fstree(ref_root) &&
             waitqueue_active(&delayed_refs->seq_wait))
                 wake_up(&delayed_refs->seq_wait);
         spin_unlock(&delayed_refs->lock);
+
         return 0;
  }
  
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h

index d8f244d9492511e3b108b26bcf4da1bc9fbf6826..413927fb9957e41fdcfb82511e63d416b8a36c76 100644 (file)
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -195,11 +195,6 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
  int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
                            struct list_head *cluster, u64 search_start);
  
-struct seq_list {
-       struct list_head list;
-       u64 seq;
-};
-
  static inline u64 inc_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs)
  {
         assert_spin_locked(&delayed_refs->lock);
@@ -229,25 +224,6 @@ btrfs_put_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
  int btrfs_check_delayed_seq(struct btrfs_delayed_ref_root *delayed_refs,
                             u64 seq);
  
-/*
- * delayed refs with a ref_seq > 0 must be held back during backref walking.
- * this only applies to items in one of the fs-trees. for_cow items never need
- * to be held back, so they won't get a ref_seq number.
- */
-static inline int need_ref_seq(int for_cow, u64 rootid)
-{
-       if (for_cow)
-               return 0;
-
-       if (rootid == BTRFS_FS_TREE_OBJECTID)
-               return 1;
-
-       if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
-               return 1;
-
-       return 0;
-}
-
  /*
   * a node might live in a head or a regular ref, this lets you
   * test for the proper type to use.
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c

index e1fe74a2ce16e6a4e0b38129160f484e642c42fa..7ae51decf6d3d0fb5c3d44bb7791f843c74aa376 100644 (file)
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1153,7 +1153,6 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
         root->orphan_block_rsv = NULL;
  
         INIT_LIST_HEAD(&root->dirty_list);
-       INIT_LIST_HEAD(&root->orphan_list);
         INIT_LIST_HEAD(&root->root_list);
         spin_lock_init(&root->orphan_lock);
         spin_lock_init(&root->inode_lock);
@@ -1166,6 +1165,7 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
         atomic_set(&root->log_commit[0], 0);
         atomic_set(&root->log_commit[1], 0);
         atomic_set(&root->log_writers, 0);
+       atomic_set(&root->orphan_inodes, 0);
         root->log_batch = 0;
         root->log_transid = 0;
         root->last_log_commit = 0;
@@ -1252,7 +1252,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
  
         leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                       BTRFS_TREE_LOG_OBJECTID, NULL,
-                                     0, 0, 0, 0);
+                                     0, 0, 0);
         if (IS_ERR(leaf)) {
                 kfree(root);
                 return ERR_CAST(leaf);
@@ -1914,11 +1914,14 @@ int open_ctree(struct super_block *sb,
         spin_lock_init(&fs_info->delayed_iput_lock);
         spin_lock_init(&fs_info->defrag_inodes_lock);
         spin_lock_init(&fs_info->free_chunk_lock);
+       spin_lock_init(&fs_info->tree_mod_seq_lock);
+       rwlock_init(&fs_info->tree_mod_log_lock);
         mutex_init(&fs_info->reloc_mutex);
  
         init_completion(&fs_info->kobj_unregister);
         INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
         INIT_LIST_HEAD(&fs_info->space_info);
+       INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
         btrfs_mapping_init(&fs_info->mapping_tree);
         btrfs_init_block_rsv(&fs_info->global_block_rsv);
         btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
@@ -1931,12 +1934,14 @@ int open_ctree(struct super_block *sb,
         atomic_set(&fs_info->async_submit_draining, 0);
         atomic_set(&fs_info->nr_async_bios, 0);
         atomic_set(&fs_info->defrag_running, 0);
+       atomic_set(&fs_info->tree_mod_seq, 0);
         fs_info->sb = sb;
         fs_info->max_inline = 8192 * 1024;
         fs_info->metadata_ratio = 0;
         fs_info->defrag_inodes = RB_ROOT;
         fs_info->trans_no_join = 0;
         fs_info->free_chunk_space = 0;
+       fs_info->tree_mod_log = RB_ROOT;
  
         /* readahead state */
         INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2001,7 +2006,8 @@ int open_ctree(struct super_block *sb,
         BTRFS_I(fs_info->btree_inode)->root = tree_root;
         memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
                sizeof(struct btrfs_key));
-       BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY,
+               &BTRFS_I(fs_info->btree_inode)->runtime_flags);
         insert_inode_hash(fs_info->btree_inode);
  
         spin_lock_init(&fs_info->block_group_cache_lock);
@@ -2353,6 +2359,13 @@ retry_root_backup:
         fs_info->generation = generation;
         fs_info->last_trans_committed = generation;
  
+       ret = btrfs_init_dev_stats(fs_info);
+       if (ret) {
+               printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+                      ret);
+               goto fail_block_groups;
+       }
+
         ret = btrfs_init_space_info(fs_info);
         if (ret) {
                 printk(KERN_ERR "Failed to initial space info: %d\n", ret);
@@ -2556,18 +2569,19 @@ recovery_tree_root:
  
  static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
  {
-       char b[BDEVNAME_SIZE];
-
         if (uptodate) {
                 set_buffer_uptodate(bh);
         } else {
+               struct btrfs_device *device = (struct btrfs_device *)
+                       bh->b_private;
+
                 printk_ratelimited(KERN_WARNING "lost page write due to "
-                                       "I/O error on %s\n",
-                                      bdevname(bh->b_bdev, b));
+                                  "I/O error on %s\n", device->name);
                 /* note, we dont' set_buffer_write_io_error because we have
                  * our own ways of dealing with the IO errors
                  */
                 clear_buffer_uptodate(bh);
+               btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
         }
         unlock_buffer(bh);
         put_bh(bh);
@@ -2682,6 +2696,7 @@ static int write_dev_supers(struct btrfs_device *device,
                         set_buffer_uptodate(bh);
                         lock_buffer(bh);
                         bh->b_end_io = btrfs_end_buffer_write_sync;
+                       bh->b_private = device;
                 }
  
                 /*
@@ -2740,6 +2755,9 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
                 }
                 if (!bio_flagged(bio, BIO_UPTODATE)) {
                         ret = -EIO;
+                       if (!bio_flagged(bio, BIO_EOPNOTSUPP))
+                               btrfs_dev_stat_inc_and_print(device,
+                                       BTRFS_DEV_STAT_FLUSH_ERRS);
                 }
  
                 /* drop the reference from the wait == 0 run */
@@ -2902,19 +2920,6 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
         return ret;
  }
  
-/* Kill all outstanding I/O */
-void btrfs_abort_devices(struct btrfs_root *root)
-{
-       struct list_head *head;
-       struct btrfs_device *dev;
-       mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-       head = &root->fs_info->fs_devices->devices;
-       list_for_each_entry_rcu(dev, head, dev_list) {
-               blk_abort_queue(dev->bdev->bd_disk->queue);
-       }
-       mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-}
-
  void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
  {
         spin_lock(&fs_info->fs_roots_radix_lock);
@@ -3671,17 +3676,6 @@ int btrfs_cleanup_transaction(struct btrfs_root *root)
         return 0;
  }
  
-static int btree_writepage_io_failed_hook(struct bio *bio, struct page *page,
-                                         u64 start, u64 end,
-                                         struct extent_state *state)
-{
-       struct super_block *sb = page->mapping->host->i_sb;
-       struct btrfs_fs_info *fs_info = btrfs_sb(sb);
-       btrfs_error(fs_info, -EIO,
-                   "Error occured while writing out btree at %llu", start);
-       return -EIO;
-}
-
  static struct extent_io_ops btree_extent_io_ops = {
         .write_cache_pages_lock_hook = btree_lock_page_hook,
         .readpage_end_io_hook = btree_readpage_end_io_hook,
@@ -3689,5 +3683,4 @@ static struct extent_io_ops btree_extent_io_ops = {
         .submit_bio_hook = btree_submit_bio_hook,
         /* note we're sharing with inode.c for the merge bio hook */
         .merge_bio_hook = btrfs_merge_bio_hook,
-       .writepage_io_failed_hook = btree_writepage_io_failed_hook,
  };
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h

index ab1830aaf0edbffba6a0cef86d13e9b3f2742cda..05b3fab39f7e814fc8c958e125f5a14c7e39d7f9 100644 (file)
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -89,7 +89,6 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
  int btrfs_cleanup_transaction(struct btrfs_root *root);
  void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
                                   struct btrfs_root *root);
-void btrfs_abort_devices(struct btrfs_root *root);
  
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c

index e887ee62b6d4ba0a98f7e2437323eecfca88bf23..614f34a899c2db468792f1ef8406c5a366739258 100644 (file)
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -13,15 +13,14 @@
                                              parent_root_objectid) / 4)
  #define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4)
  
-static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
-                          int connectable)
+static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+                          struct inode *parent)
  {
         struct btrfs_fid *fid = (struct btrfs_fid *)fh;
-       struct inode *inode = dentry->d_inode;
         int len = *max_len;
         int type;
  
-       if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
+       if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
                 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
                 return 255;
         } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
@@ -36,19 +35,13 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
         fid->root_objectid = BTRFS_I(inode)->root->objectid;
         fid->gen = inode->i_generation;
  
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
+       if (parent) {
                 u64 parent_root_id;
  
-               spin_lock(&dentry->d_lock);
-
-               parent = dentry->d_parent->d_inode;
                 fid->parent_objectid = BTRFS_I(parent)->location.objectid;
                 fid->parent_gen = parent->i_generation;
                 parent_root_id = BTRFS_I(parent)->root->objectid;
  
-               spin_unlock(&dentry->d_lock);
-
                 if (parent_root_id != fid->root_objectid) {
                         fid->parent_root_objectid = parent_root_id;
                         len = BTRFS_FID_SIZE_CONNECTABLE_ROOT;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c

index 49fd7b66d57b272c7aeaea7db4b1bbd0985f8aa2..4b5a1e1bdefbe095c239b464e55c9699e865175b 100644 (file)
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3578,7 +3578,7 @@ again:
         space_info->chunk_alloc = 0;
         spin_unlock(&space_info->lock);
  out:
-       mutex_unlock(&extent_root->fs_info->chunk_mutex);
+       mutex_unlock(&fs_info->chunk_mutex);
         return ret;
  }
  
@@ -4355,10 +4355,9 @@ static unsigned drop_outstanding_extent(struct inode *inode)
         BTRFS_I(inode)->outstanding_extents--;
  
         if (BTRFS_I(inode)->outstanding_extents == 0 &&
-           BTRFS_I(inode)->delalloc_meta_reserved) {
+           test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                 drop_inode_space = 1;
-               BTRFS_I(inode)->delalloc_meta_reserved = 0;
-       }
  
         /*
          * If we have more or the same amount of outsanding extents than we have
@@ -4465,7 +4464,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
          * Add an item to reserve for updating the inode when we complete the
          * delalloc io.
          */
-       if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+       if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                     &BTRFS_I(inode)->runtime_flags)) {
                 nr_extents++;
                 extra_reserve = 1;
         }
@@ -4511,7 +4511,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
  
         spin_lock(&BTRFS_I(inode)->lock);
         if (extra_reserve) {
-               BTRFS_I(inode)->delalloc_meta_reserved = 1;
+               set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+                       &BTRFS_I(inode)->runtime_flags);
                 nr_extents--;
         }
         BTRFS_I(inode)->reserved_extents += nr_extents;
@@ -5217,7 +5218,7 @@ out:
  void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct extent_buffer *buf,
-                          u64 parent, int last_ref, int for_cow)
+                          u64 parent, int last_ref)
  {
         struct btrfs_block_group_cache *cache = NULL;
         int ret;
@@ -5227,7 +5228,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
                                         buf->start, buf->len,
                                         parent, root->root_key.objectid,
                                         btrfs_header_level(buf),
-                                       BTRFS_DROP_DELAYED_REF, NULL, for_cow);
+                                       BTRFS_DROP_DELAYED_REF, NULL, 0);
                 BUG_ON(ret); /* -ENOMEM */
         }
  
@@ -6249,7 +6250,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u32 blocksize,
                                         u64 parent, u64 root_objectid,
                                         struct btrfs_disk_key *key, int level,
-                                       u64 hint, u64 empty_size, int for_cow)
+                                       u64 hint, u64 empty_size)
  {
         struct btrfs_key ins;
         struct btrfs_block_rsv *block_rsv;
@@ -6297,7 +6298,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                         ins.objectid,
                                         ins.offset, parent, root_objectid,
                                         level, BTRFS_ADD_DELAYED_EXTENT,
-                                       extent_op, for_cow);
+                                       extent_op, 0);
                 BUG_ON(ret); /* -ENOMEM */
         }
         return buf;
@@ -6715,7 +6716,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                btrfs_header_owner(path->nodes[level + 1]));
         }
  
-       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1, 0);
+       btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
  out:
         wc->refs[level] = 0;
         wc->flags[level] = 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c

index c9018a05036e943a52ad91d81019bb4b934b6b9a..2c8f7b2046173954f720125a6e53e96de3c7727e 100644 (file)
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -186,7 +186,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
                         return parent;
         }
  
-       entry = rb_entry(node, struct tree_entry, rb_node);
         rb_link_node(node, parent, p);
         rb_insert_color(node, root);
         return NULL;
@@ -413,7 +412,7 @@ static struct extent_state *next_state(struct extent_state *state)
  
  /*
   * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1)
+ * it will optionally wake up any one waiting on this state (wake == 1).
   *
   * If no bits are set on the state struct after clearing things, the
   * struct is freed and removed from the tree
@@ -570,10 +569,8 @@ hit_next:
                 if (err)
                         goto out;
                 if (state->end <= end) {
-                       clear_state_bit(tree, state, &bits, wake);
-                       if (last_end == (u64)-1)
-                               goto out;
-                       start = last_end + 1;
+                       state = clear_state_bit(tree, state, &bits, wake);
+                       goto next;
                 }
                 goto search_again;
         }
@@ -781,7 +778,6 @@ hit_next:
          * Just lock what we found and keep going
          */
         if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
                 if (state->state & exclusive_bits) {
                         *failed_start = state->start;
                         err = -EEXIST;
@@ -789,20 +785,15 @@ hit_next:
                 }
  
                 set_state_bits(tree, state, &bits);
-
                 cache_state(state, cached_state);
                 merge_state(tree, state);
                 if (last_end == (u64)-1)
                         goto out;
-
                 start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               state = next_state(state);
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                 goto search_again;
         }
  
@@ -845,6 +836,10 @@ hit_next:
                         if (last_end == (u64)-1)
                                 goto out;
                         start = last_end + 1;
+                       state = next_state(state);
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                 }
                 goto search_again;
         }
@@ -994,21 +989,14 @@ hit_next:
          * Just lock what we found and keep going
          */
         if (state->start == start && state->end <= end) {
-               struct rb_node *next_node;
-
                 set_state_bits(tree, state, &bits);
-               clear_state_bit(tree, state, &clear_bits, 0);
+               state = clear_state_bit(tree, state, &clear_bits, 0);
                 if (last_end == (u64)-1)
                         goto out;
-
                 start = last_end + 1;
-               next_node = rb_next(&state->rb_node);
-               if (next_node && start < end && prealloc && !need_resched()) {
-                       state = rb_entry(next_node, struct extent_state,
-                                        rb_node);
-                       if (state->start == start)
-                               goto hit_next;
-               }
+               if (start < end && state && state->start == start &&
+                   !need_resched())
+                       goto hit_next;
                 goto search_again;
         }
  
@@ -1042,10 +1030,13 @@ hit_next:
                         goto out;
                 if (state->end <= end) {
                         set_state_bits(tree, state, &bits);
-                       clear_state_bit(tree, state, &clear_bits, 0);
+                       state = clear_state_bit(tree, state, &clear_bits, 0);
                         if (last_end == (u64)-1)
                                 goto out;
                         start = last_end + 1;
+                       if (start < end && state && state->start == start &&
+                           !need_resched())
+                               goto hit_next;
                 }
                 goto search_again;
         }
@@ -1173,9 +1164,8 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                               cached_state, mask);
  }
  
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-                                u64 end, struct extent_state **cached_state,
-                                gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask)
  {
         return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
                                 cached_state, mask);
@@ -1293,7 +1283,7 @@ out:
   * returned if we find something, and *start_ret and *end_ret are
   * set to reflect the state struct that was found.
   *
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned. If something was found, 0 is returned.
   */
  int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
                           u64 *start_ret, u64 *end_ret, int bits)
@@ -1923,6 +1913,7 @@ int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
         if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
                 /* try to remap that extent elsewhere? */
                 bio_put(bio);
+               btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
                 return -EIO;
         }
  
@@ -2222,17 +2213,7 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
                         uptodate = 0;
         }
  
-       if (!uptodate && tree->ops &&
-           tree->ops->writepage_io_failed_hook) {
-               ret = tree->ops->writepage_io_failed_hook(NULL, page,
-                                                start, end, NULL);
-               /* Writeback already completed */
-               if (ret == 0)
-                       return 1;
-       }
-
         if (!uptodate) {
-               clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
                 ClearPageUptodate(page);
                 SetPageError(page);
         }
@@ -2347,10 +2328,23 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
                 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
                         ret = tree->ops->readpage_end_io_hook(page, start, end,
                                                               state, mirror);
-                       if (ret)
+                       if (ret) {
+                               /* no IO indicated but software detected errors
+                                * in the block, either checksum errors or
+                                * issues with the contents */
+                               struct btrfs_root *root =
+                                       BTRFS_I(page->mapping->host)->root;
+                               struct btrfs_device *device;
+
                                 uptodate = 0;
-                       else
+                               device = btrfs_find_device_for_logical(
+                                               root, start, mirror);
+                               if (device)
+                                       btrfs_dev_stat_inc_and_print(device,
+                                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
+                       } else {
                                 clean_io_failure(start, page);
+                       }
                 }
  
                 if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
@@ -3164,7 +3158,7 @@ static int write_one_eb(struct extent_buffer *eb,
         u64 offset = eb->start;
         unsigned long i, num_pages;
         int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
-       int ret;
+       int ret = 0;
  
         clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
         num_pages = num_extent_pages(eb->start, eb->len);
@@ -3930,6 +3924,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
         eb->start = start;
         eb->len = len;
         eb->tree = tree;
+       eb->bflags = 0;
         rwlock_init(&eb->lock);
         atomic_set(&eb->write_locks, 0);
         atomic_set(&eb->read_locks, 0);
@@ -3967,6 +3962,60 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
         return eb;
  }
  
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
+{
+       unsigned long i;
+       struct page *p;
+       struct extent_buffer *new;
+       unsigned long num_pages = num_extent_pages(src->start, src->len);
+       /* Returns a tree-less EXTENT_BUFFER_DUMMY copy of src, or NULL. */
+       new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+       if (new == NULL)
+               return NULL;
+       /* back the clone with freshly allocated pages, one per page of src */
+       for (i = 0; i < num_pages; i++) {
+               p = alloc_page(GFP_ATOMIC);
+               BUG_ON(!p);     /* NOTE(review): alloc failure is fatal */
+               attach_extent_buffer_page(new, p);
+               WARN_ON(PageDirty(p));
+               SetPageUptodate(p);
+               new->pages[i] = p;
+       }
+       /* copy all of src, then flag the clone as uptodate and as a dummy */
+       copy_extent_buffer(new, src, 0, 0, src->len);
+       set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+       set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+       return new;
+}
+
+/* Allocate a stand-alone EXTENT_BUFFER_DUMMY buffer; NULL on failure. */
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
+{
+       struct extent_buffer *eb;
+       unsigned long num_pages = num_extent_pages(0, len);
+       unsigned long i;
+
+       eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+       if (!eb)
+               return NULL;
+
+       for (i = 0; i < num_pages; i++) {
+               eb->pages[i] = alloc_page(GFP_ATOMIC);
+               if (!eb->pages[i])
+                       goto err;
+       }
+       set_extent_buffer_uptodate(eb);
+       btrfs_set_header_nritems(eb, 0);        /* no items yet */
+       set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+       return eb;
+err:
+       for (; i > 0; i--)      /* i is unsigned; frees pages[0..i-1] */
+               __free_page(eb->pages[i - 1]);
+       __free_extent_buffer(eb);
+       return NULL;
+}
+
  static int extent_buffer_under_io(struct extent_buffer *eb)
  {
         return (atomic_read(&eb->io_pages) ||
@@ -3981,18 +4030,21 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                                                 unsigned long start_idx)
  {
         unsigned long index;
+       unsigned long num_pages;
         struct page *page;
+       int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
  
         BUG_ON(extent_buffer_under_io(eb));
  
-       index = num_extent_pages(eb->start, eb->len);
+       num_pages = num_extent_pages(eb->start, eb->len);
+       index = start_idx + num_pages;
         if (start_idx >= index)
                 return;
  
         do {
                 index--;
                 page = extent_buffer_page(eb, index);
-               if (page) {
+               if (page && mapped) {
                         spin_lock(&page->mapping->private_lock);
                         /*
                          * We do this since we'll remove the pages after we've
@@ -4017,6 +4069,8 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
                         }
                         spin_unlock(&page->mapping->private_lock);
  
+               }
+               if (page) {
                         /* One for when we alloced the page */
                         page_cache_release(page);
                 }
@@ -4235,14 +4289,18 @@ static void release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
  {
         WARN_ON(atomic_read(&eb->refs) == 0);
         if (atomic_dec_and_test(&eb->refs)) {
-               struct extent_io_tree *tree = eb->tree;
+               if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+                       spin_unlock(&eb->refs_lock);
+               } else {
+                       struct extent_io_tree *tree = eb->tree;
  
-               spin_unlock(&eb->refs_lock);
+                       spin_unlock(&eb->refs_lock);
  
-               spin_lock(&tree->buffer_lock);
-               radix_tree_delete(&tree->buffer,
-                                 eb->start >> PAGE_CACHE_SHIFT);
-               spin_unlock(&tree->buffer_lock);
+                       spin_lock(&tree->buffer_lock);
+                       radix_tree_delete(&tree->buffer,
+                                         eb->start >> PAGE_CACHE_SHIFT);
+                       spin_unlock(&tree->buffer_lock);
+               }
  
                 /* Should be safe to release our pages at this point */
                 btrfs_release_extent_buffer_page(eb, 0);
@@ -4259,6 +4317,10 @@ void free_extent_buffer(struct extent_buffer *eb)
                 return;
  
         spin_lock(&eb->refs_lock);
+       if (atomic_read(&eb->refs) == 2 &&
+           test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+               atomic_dec(&eb->refs);
+
         if (atomic_read(&eb->refs) == 2 &&
             test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
             !extent_buffer_under_io(eb) &&
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h

index b516c3b8dec68d825e380a1930976f34c8a3e1a4..25900af5b15d43e6bdfe0cef7c865ac2aa81bd36 100644 (file)
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -39,6 +39,7 @@
  #define EXTENT_BUFFER_STALE 6
  #define EXTENT_BUFFER_WRITEBACK 7
  #define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
  
  /* these are flags for extent_clear_unlock_delalloc */
  #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -75,9 +76,6 @@ struct extent_io_ops {
                               unsigned long bio_flags);
         int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
         int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-       int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
-                                       u64 start, u64 end,
-                                      struct extent_state *state);
         int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
                                     struct extent_state *state, int mirror);
         int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
@@ -225,6 +223,8 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                    struct extent_state **cached_state, gfp_t mask);
  int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                         struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+                         struct extent_state **cached_state, gfp_t mask);
  int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
                    gfp_t mask);
  int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -265,6 +265,8 @@ void set_page_extent_mapped(struct page *page);
  
  struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
                                           u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
  struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
                                          u64 start, unsigned long len);
  void free_extent_buffer(struct extent_buffer *eb);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c

index 53bf2d764bbc4f5814db04710d3123d03c3779ba..70dc8ca73e257bc3a1e7a96ea48009bff093af9a 100644 (file)
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -65,6 +65,21 @@ struct inode_defrag {
         int cycled;
  };
  
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+                                 struct inode_defrag *defrag2)
+{
+       if (defrag1->root > defrag2->root)
+               return 1;
+       else if (defrag1->root < defrag2->root)
+               return -1;
+       else if (defrag1->ino > defrag2->ino)
+               return 1;
+       else if (defrag1->ino < defrag2->ino)
+               return -1;
+       else
+               return 0;
+}
+
  /* pop a record for an inode into the defrag tree.  The lock
   * must be held already
   *
@@ -81,15 +96,17 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
         struct inode_defrag *entry;
         struct rb_node **p;
         struct rb_node *parent = NULL;
+       int ret;
  
         p = &root->fs_info->defrag_inodes.rb_node;
         while (*p) {
                 parent = *p;
                 entry = rb_entry(parent, struct inode_defrag, rb_node);
  
-               if (defrag->ino < entry->ino)
+               ret = __compare_inode_defrag(defrag, entry);
+               if (ret < 0)
                         p = &parent->rb_left;
-               else if (defrag->ino > entry->ino)
+               else if (ret > 0)
                         p = &parent->rb_right;
                 else {
                         /* if we're reinserting an entry for
@@ -103,7 +120,7 @@ static void __btrfs_add_inode_defrag(struct inode *inode,
                         goto exists;
                 }
         }
-       BTRFS_I(inode)->in_defrag = 1;
+       set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
         rb_link_node(&defrag->rb_node, parent, p);
         rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
         return;
@@ -131,7 +148,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
         if (btrfs_fs_closing(root->fs_info))
                 return 0;
  
-       if (BTRFS_I(inode)->in_defrag)
+       if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
                 return 0;
  
         if (trans)
@@ -148,7 +165,7 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
         defrag->root = root->root_key.objectid;
  
         spin_lock(&root->fs_info->defrag_inodes_lock);
-       if (!BTRFS_I(inode)->in_defrag)
+       if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
                 __btrfs_add_inode_defrag(inode, defrag);
         else
                 kfree(defrag);
@@ -159,28 +176,35 @@ int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
  /*
   * must be called with the defrag_inodes lock held
   */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
+struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info,
+                                            u64 root, u64 ino,
                                              struct rb_node **next)
  {
         struct inode_defrag *entry = NULL;
+       struct inode_defrag tmp;
         struct rb_node *p;
         struct rb_node *parent = NULL;
+       int ret;
+
+       tmp.ino = ino;
+       tmp.root = root;
  
         p = info->defrag_inodes.rb_node;
         while (p) {
                 parent = p;
                 entry = rb_entry(parent, struct inode_defrag, rb_node);
  
-               if (ino < entry->ino)
+               ret = __compare_inode_defrag(&tmp, entry);
+               if (ret < 0)
                         p = parent->rb_left;
-               else if (ino > entry->ino)
+               else if (ret > 0)
                         p = parent->rb_right;
                 else
                         return entry;
         }
  
         if (next) {
-               while (parent && ino > entry->ino) {
+               while (parent && __compare_inode_defrag(&tmp, entry) > 0) {
                         parent = rb_next(parent);
                         entry = rb_entry(parent, struct inode_defrag, rb_node);
                 }
@@ -202,6 +226,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
         struct btrfs_key key;
         struct btrfs_ioctl_defrag_range_args range;
         u64 first_ino = 0;
+       u64 root_objectid = 0;
         int num_defrag;
         int defrag_batch = 1024;
  
@@ -214,11 +239,14 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                 n = NULL;
  
                 /* find an inode to defrag */
-               defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+               defrag = btrfs_find_defrag_inode(fs_info, root_objectid,
+                                                first_ino, &n);
                 if (!defrag) {
-                       if (n)
-                               defrag = rb_entry(n, struct inode_defrag, rb_node);
-                       else if (first_ino) {
+                       if (n) {
+                               defrag = rb_entry(n, struct inode_defrag,
+                                                 rb_node);
+                       } else if (root_objectid || first_ino) {
+                               root_objectid = 0;
                                 first_ino = 0;
                                 continue;
                         } else {
@@ -228,6 +256,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
  
                 /* remove it from the rbtree */
                 first_ino = defrag->ino + 1;
+               root_objectid = defrag->root;
                 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
  
                 if (btrfs_fs_closing(fs_info))
@@ -252,7 +281,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
                         goto next;
  
                 /* do a chunk of defrag */
-               BTRFS_I(inode)->in_defrag = 0;
+               clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
                 range.start = defrag->last_offset;
                 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
                                                defrag_batch);
@@ -1404,12 +1433,11 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
                 goto out;
         }
  
-       err = btrfs_update_time(file);
+       err = file_update_time(file);
         if (err) {
                 mutex_unlock(&inode->i_mutex);
                 goto out;
         }
-       BTRFS_I(inode)->sequence++;
  
         start_pos = round_down(pos, root->sectorsize);
         if (start_pos > i_size_read(inode)) {
@@ -1466,8 +1494,8 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
          * flush down new bytes that may have been written if the
          * application were using truncate to replace a file in place.
          */
-       if (BTRFS_I(inode)->ordered_data_close) {
-               BTRFS_I(inode)->ordered_data_close = 0;
+       if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                              &BTRFS_I(inode)->runtime_flags)) {
                 btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
                 if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
                         filemap_flush(inode->i_mapping);
@@ -1498,14 +1526,15 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
  
         trace_btrfs_sync_file(file, datasync);
  
-       ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
-       if (ret)
-               return ret;
         mutex_lock(&inode->i_mutex);
  
-       /* we wait first, since the writeback may change the inode */
+       /*
+        * We wait first, since the writeback may change the inode.  Also,
+        * waiting on the ordered range does a filemap_write_and_wait_range(),
+        * which is why we don't do it above like other file systems.
+        */
         root->log_batch++;
-       btrfs_wait_ordered_range(inode, 0, (u64)-1);
+       btrfs_wait_ordered_range(inode, start, end);
         root->log_batch++;
  
         /*
@@ -1523,7 +1552,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
          * syncing
          */
         smp_mb();
-       if (BTRFS_I(inode)->last_trans <=
+       if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+           BTRFS_I(inode)->last_trans <=
             root->fs_info->last_trans_committed) {
                 BTRFS_I(inode)->last_trans = 0;
                 mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c

index 202008ec367d4c4c2cfcf73f7289692dd910b25c..81296c57405a5d53a27dba626a4d6201829bd578 100644 (file)
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -33,6 +33,8 @@
  
  static int link_free_space(struct btrfs_free_space_ctl *ctl,
                            struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+                             struct btrfs_free_space *info);
  
  static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                                                struct btrfs_path *path,
@@ -75,7 +77,8 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
                 return ERR_PTR(-ENOENT);
         }
  
-       inode->i_mapping->flags &= ~__GFP_FS;
+       mapping_set_gfp_mask(inode->i_mapping,
+                       mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
  
         return inode;
  }
@@ -365,7 +368,7 @@ static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
  
  static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
  {
-       u64 *val;
+       __le64 *val;
  
         io_ctl_map_page(io_ctl, 1);
  
@@ -388,7 +391,7 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
  
  static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
  {
-       u64 *gen;
+       __le64 *gen;
  
         /*
          * Skip the crc area.  If we don't check crcs then we just have a 64bit
@@ -584,6 +587,44 @@ static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
         return 0;
  }
  
+/*
+ * Since we attach pinned extents after the fact we can have contiguous sections
+ * of free space that are split up in entries.  This poses a problem with the
+ * tree logging stuff since it could have allocated across what appears to be 2
+ * entries since we would have merged the entries when adding the pinned extents
+ * back to the free space cache.  So run through the space cache that we just
+ * loaded and merge contiguous entries.  This will make the log replay stuff not
+ * blow up and it will make for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+       struct btrfs_free_space *e, *prev = NULL;
+       struct rb_node *n;
+
+again:
+       spin_lock(&ctl->tree_lock);
+       for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+               e = rb_entry(n, struct btrfs_free_space, offset_index);
+               if (!prev)
+                       goto next;
+               if (e->bitmap || prev->bitmap)
+                       goto next;
+               if (prev->offset + prev->bytes == e->offset) {
+                       unlink_free_space(ctl, prev);
+                       unlink_free_space(ctl, e);
+                       prev->bytes += e->bytes;
+                       kmem_cache_free(btrfs_free_space_cachep, e);
+                       link_free_space(ctl, prev);
+                       prev = NULL;
+                       spin_unlock(&ctl->tree_lock);
+                       goto again;
+               }
+next:
+               prev = e;
+       }
+       spin_unlock(&ctl->tree_lock);
+}
+
  int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
                             struct btrfs_free_space_ctl *ctl,
                             struct btrfs_path *path, u64 offset)
@@ -726,6 +767,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
         }
  
         io_ctl_drop_pages(&io_ctl);
+       merge_space_tree(ctl);
         ret = 1;
  out:
         io_ctl_free(&io_ctl);
@@ -972,9 +1014,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
                 goto out;
  
  
-       ret = filemap_write_and_wait(inode->i_mapping);
-       if (ret)
-               goto out;
+       btrfs_wait_ordered_range(inode, 0, (u64)-1);
  
         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
         key.offset = offset;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c

index ceb7b9c9edcc1436693178fd6d2ff62f2334ada7..f6ab6f5e635a39b18ddb7f259bf5f0edd25d10a0 100644 (file)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -89,7 +89,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
  
  static int btrfs_setsize(struct inode *inode, loff_t newsize);
  static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
  static noinline int cow_file_range(struct inode *inode,
                                    struct page *locked_page,
                                    u64 start, u64 end, int *page_started,
@@ -257,10 +257,13 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
         ret = insert_inline_extent(trans, root, inode, start,
                                    inline_len, compressed_size,
                                    compress_type, compressed_pages);
-       if (ret) {
+       if (ret && ret != -ENOSPC) {
                 btrfs_abort_transaction(trans, root, ret);
                 return ret;
+       } else if (ret == -ENOSPC) {
+               return 1;
         }
+
         btrfs_delalloc_release_metadata(inode, end + 1 - start);
         btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
         return 0;
@@ -1572,11 +1575,11 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
         if (btrfs_is_free_space_inode(root, inode))
                 metadata = 2;
  
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
-       if (ret)
-               return ret;
-
         if (!(rw & REQ_WRITE)) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+               if (ret)
+                       return ret;
+
                 if (bio_flags & EXTENT_BIO_COMPRESSED) {
                         return btrfs_submit_compressed_read(inode, bio,
                                                     mirror_num, bio_flags);
@@ -1815,25 +1818,24 @@ out:
   * an ordered extent if the range of bytes in the file it covers are
   * fully written.
   */
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
  {
+       struct inode *inode = ordered_extent->inode;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct btrfs_trans_handle *trans = NULL;
-       struct btrfs_ordered_extent *ordered_extent = NULL;
         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
         struct extent_state *cached_state = NULL;
         int compress_type = 0;
         int ret;
         bool nolock;
  
-       ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-                                            end - start + 1);
-       if (!ret)
-               return 0;
-       BUG_ON(!ordered_extent); /* Logic error */
-
         nolock = btrfs_is_free_space_inode(root, inode);
  
+       if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+               ret = -EIO;
+               goto out;
+       }
+
         if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
                 BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
                 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -1889,12 +1891,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                                    ordered_extent->file_offset,
                                    ordered_extent->len);
         }
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
         if (ret < 0) {
                 btrfs_abort_transaction(trans, root, ret);
-               goto out;
+               goto out_unlock;
         }
  
         add_pending_csums(trans, inode, ordered_extent->file_offset,
@@ -1905,10 +1905,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                 ret = btrfs_update_inode_fallback(trans, root, inode);
                 if (ret) { /* -ENOMEM or corruption */
                         btrfs_abort_transaction(trans, root, ret);
-                       goto out;
+                       goto out_unlock;
                 }
         }
         ret = 0;
+out_unlock:
+       unlock_extent_cached(io_tree, ordered_extent->file_offset,
+                            ordered_extent->file_offset +
+                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
  out:
         if (root != root->fs_info->tree_root)
                 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
@@ -1919,26 +1923,57 @@ out:
                         btrfs_end_transaction(trans, root);
         }
  
+       if (ret)
+               clear_extent_uptodate(io_tree, ordered_extent->file_offset,
+                                     ordered_extent->file_offset +
+                                     ordered_extent->len - 1, NULL, GFP_NOFS);
+
+       /*
+        * This needs to be done to make sure anybody waiting knows we are done
+        * updating everything for this ordered extent.
+        */
+       btrfs_remove_ordered_extent(inode, ordered_extent);
+
         /* once for us */
         btrfs_put_ordered_extent(ordered_extent);
         /* once for the tree */
         btrfs_put_ordered_extent(ordered_extent);
  
-       return 0;
-out_unlock:
-       unlock_extent_cached(io_tree, ordered_extent->file_offset,
-                            ordered_extent->file_offset +
-                            ordered_extent->len - 1, &cached_state, GFP_NOFS);
-       goto out;
+       return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+       struct btrfs_ordered_extent *ordered_extent;
+       ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+       btrfs_finish_ordered_io(ordered_extent);
  }
  
  static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
                                 struct extent_state *state, int uptodate)
  {
+       struct inode *inode = page->mapping->host;
+       struct btrfs_root *root = BTRFS_I(inode)->root;
+       struct btrfs_ordered_extent *ordered_extent = NULL;
+       struct btrfs_workers *workers;
+
         trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
  
         ClearPagePrivate2(page);
-       return btrfs_finish_ordered_io(page->mapping->host, start, end);
+       if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+                                           end - start + 1, uptodate))
+               return 0;
+
+       ordered_extent->work.func = finish_ordered_fn;
+       ordered_extent->work.flags = 0;
+
+       if (btrfs_is_free_space_inode(root, inode))
+               workers = &root->fs_info->endio_freespace_worker;
+       else
+               workers = &root->fs_info->endio_write_workers;
+       btrfs_queue_worker(workers, &ordered_extent->work);
+
+       return 0;
  }
  
  /*
@@ -2072,12 +2107,12 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
         struct btrfs_block_rsv *block_rsv;
         int ret;
  
-       if (!list_empty(&root->orphan_list) ||
+       if (atomic_read(&root->orphan_inodes) ||
             root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
                 return;
  
         spin_lock(&root->orphan_lock);
-       if (!list_empty(&root->orphan_list)) {
+       if (atomic_read(&root->orphan_inodes)) {
                 spin_unlock(&root->orphan_lock);
                 return;
         }
@@ -2134,8 +2169,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                 block_rsv = NULL;
         }
  
-       if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+       if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                             &BTRFS_I(inode)->runtime_flags)) {
  #if 0
                 /*
                  * For proper ENOSPC handling, we should do orphan
@@ -2148,12 +2183,12 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
                         insert = 1;
  #endif
                 insert = 1;
+               atomic_dec(&root->orphan_inodes);
         }
  
-       if (!BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 1;
+       if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                             &BTRFS_I(inode)->runtime_flags))
                 reserve = 1;
-       }
         spin_unlock(&root->orphan_lock);
  
         /* grab metadata reservation from transaction handle */
@@ -2166,6 +2201,8 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
         if (insert >= 1) {
                 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
                 if (ret && ret != -EEXIST) {
+                       clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                                 &BTRFS_I(inode)->runtime_flags);
                         btrfs_abort_transaction(trans, root, ret);
                         return ret;
                 }
@@ -2196,15 +2233,13 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
         int ret = 0;
  
         spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+       if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                              &BTRFS_I(inode)->runtime_flags))
                 delete_item = 1;
-       }
  
-       if (BTRFS_I(inode)->orphan_meta_reserved) {
-               BTRFS_I(inode)->orphan_meta_reserved = 0;
+       if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+                              &BTRFS_I(inode)->runtime_flags))
                 release_rsv = 1;
-       }
         spin_unlock(&root->orphan_lock);
  
         if (trans && delete_item) {
@@ -2212,8 +2247,10 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
                 BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
         }
  
-       if (release_rsv)
+       if (release_rsv) {
                 btrfs_orphan_release_metadata(inode);
+               atomic_dec(&root->orphan_inodes);
+       }
  
         return 0;
  }
@@ -2341,6 +2378,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                                 ret = PTR_ERR(trans);
                                 goto out;
                         }
+                       printk(KERN_ERR "auto deleting %Lu\n",
+                              found_key.objectid);
                         ret = btrfs_del_orphan_item(trans, root,
                                                     found_key.objectid);
                         BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
@@ -2352,9 +2391,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                  * add this inode to the orphan list so btrfs_orphan_del does
                  * the proper thing when we hit it
                  */
-               spin_lock(&root->orphan_lock);
-               list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-               spin_unlock(&root->orphan_lock);
+               set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                       &BTRFS_I(inode)->runtime_flags);
  
                 /* if we have links, this was a truncate, lets do that */
                 if (inode->i_nlink) {
@@ -2510,7 +2548,7 @@ static void btrfs_read_locked_inode(struct inode *inode)
  
         inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
         BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
-       BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+       inode->i_version = btrfs_inode_sequence(leaf, inode_item);
         inode->i_generation = BTRFS_I(inode)->generation;
         inode->i_rdev = 0;
         rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2594,7 +2632,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
  
         btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
         btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-       btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
+       btrfs_set_inode_sequence(leaf, item, inode->i_version);
         btrfs_set_inode_transid(leaf, item, trans->transid);
         btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
         btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
@@ -2752,6 +2790,8 @@ err:
                 goto out;
  
         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(inode);
+       inode_inc_iversion(dir);
         inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
         btrfs_update_inode(trans, root, dir);
  out:
@@ -3089,6 +3129,7 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
         }
  
         btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+       inode_inc_iversion(dir);
         dir->i_mtime = dir->i_ctime = CURRENT_TIME;
         ret = btrfs_update_inode(trans, root, dir);
         if (ret)
@@ -3607,7 +3648,8 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
                  * any new writes get down to disk quickly.
                  */
                 if (newsize == 0)
-                       BTRFS_I(inode)->ordered_data_close = 1;
+                       set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                               &BTRFS_I(inode)->runtime_flags);
  
                 /* we don't support swapfiles, so vmtruncate shouldn't fail */
                 truncate_setsize(inode, newsize);
@@ -3638,6 +3680,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
  
         if (attr->ia_valid) {
                 setattr_copy(inode, attr);
+               inode_inc_iversion(inode);
                 err = btrfs_dirty_inode(inode);
  
                 if (!err && attr->ia_valid & ATTR_MODE)
@@ -3671,7 +3714,8 @@ void btrfs_evict_inode(struct inode *inode)
         btrfs_wait_ordered_range(inode, 0, (u64)-1);
  
         if (root->fs_info->log_root_recovering) {
-               BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+               BUG_ON(!test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                                &BTRFS_I(inode)->runtime_flags));
                 goto no_delete;
         }
  
@@ -4066,7 +4110,7 @@ static struct inode *new_simple_dir(struct super_block *s,
  
         BTRFS_I(inode)->root = root;
         memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
-       BTRFS_I(inode)->dummy_inode = 1;
+       set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
  
         inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
         inode->i_op = &btrfs_dir_ro_inode_operations;
@@ -4370,7 +4414,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
         int ret = 0;
         bool nolock = false;
  
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                 return 0;
  
         if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
@@ -4403,7 +4447,7 @@ int btrfs_dirty_inode(struct inode *inode)
         struct btrfs_trans_handle *trans;
         int ret;
  
-       if (BTRFS_I(inode)->dummy_inode)
+       if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
                 return 0;
  
         trans = btrfs_join_transaction(root);
@@ -4431,46 +4475,18 @@ int btrfs_dirty_inode(struct inode *inode)
   * This is a copy of file_update_time.  We need this so we can return error on
   * ENOSPC for updating the inode in the case of file write and mmap writes.
   */
-int btrfs_update_time(struct file *file)
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+                            int flags)
  {
-       struct inode *inode = file->f_path.dentry->d_inode;
-       struct timespec now;
-       int ret;
-       enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
-
-       /* First try to exhaust all avenues to not sync */
-       if (IS_NOCMTIME(inode))
-               return 0;
-
-       now = current_fs_time(inode->i_sb);
-       if (!timespec_equal(&inode->i_mtime, &now))
-               sync_it = S_MTIME;
-
-       if (!timespec_equal(&inode->i_ctime, &now))
-               sync_it |= S_CTIME;
-
-       if (IS_I_VERSION(inode))
-               sync_it |= S_VERSION;
-
-       if (!sync_it)
-               return 0;
-
-       /* Finally allowed to write? Takes lock. */
-       if (mnt_want_write_file(file))
-               return 0;
-
-       /* Only change inode inside the lock region */
-       if (sync_it & S_VERSION)
+       if (flags & S_VERSION)
                 inode_inc_iversion(inode);
-       if (sync_it & S_CTIME)
-               inode->i_ctime = now;
-       if (sync_it & S_MTIME)
-               inode->i_mtime = now;
-       ret = btrfs_dirty_inode(inode);
-       if (!ret)
-               mark_inode_dirty_sync(inode);
-       mnt_drop_write(file->f_path.mnt);
-       return ret;
+       if (flags & S_CTIME)
+               inode->i_ctime = *now;
+       if (flags & S_MTIME)
+               inode->i_mtime = *now;
+       if (flags & S_ATIME)
+               inode->i_atime = *now;
+       return btrfs_dirty_inode(inode);
  }
  
  /*
@@ -4730,6 +4746,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
  
         btrfs_i_size_write(parent_inode, parent_inode->i_size +
                            name_len * 2);
+       inode_inc_iversion(parent_inode);
         parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
         ret = btrfs_update_inode(trans, root, parent_inode);
         if (ret)
@@ -4937,6 +4954,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
         }
  
         btrfs_inc_nlink(inode);
+       inode_inc_iversion(inode);
         inode->i_ctime = CURRENT_TIME;
         ihold(inode);
  
@@ -5903,9 +5921,7 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
         struct btrfs_dio_private *dip = bio->bi_private;
         struct inode *inode = dip->inode;
         struct btrfs_root *root = BTRFS_I(inode)->root;
-       struct btrfs_trans_handle *trans;
         struct btrfs_ordered_extent *ordered = NULL;
-       struct extent_state *cached_state = NULL;
         u64 ordered_offset = dip->logical_offset;
         u64 ordered_bytes = dip->bytes;
         int ret;
@@ -5915,73 +5931,14 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
  again:
         ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                    &ordered_offset,
-                                                  ordered_bytes);
+                                                  ordered_bytes, !err);
         if (!ret)
                 goto out_test;
  
-       BUG_ON(!ordered);
-
-       trans = btrfs_join_transaction(root);
-       if (IS_ERR(trans)) {
-               err = -ENOMEM;
-               goto out;
-       }
-       trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
-       if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
-               ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-               if (!ret)
-                       err = btrfs_update_inode_fallback(trans, root, inode);
-               goto out;
-       }
-
-       lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                        ordered->file_offset + ordered->len - 1, 0,
-                        &cached_state);
-
-       if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-               ret = btrfs_mark_extent_written(trans, inode,
-                                               ordered->file_offset,
-                                               ordered->file_offset +
-                                               ordered->len);
-               if (ret) {
-                       err = ret;
-                       goto out_unlock;
-               }
-       } else {
-               ret = insert_reserved_file_extent(trans, inode,
-                                                 ordered->file_offset,
-                                                 ordered->start,
-                                                 ordered->disk_len,
-                                                 ordered->len,
-                                                 ordered->len,
-                                                 0, 0, 0,
-                                                 BTRFS_FILE_EXTENT_REG);
-               unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-                                  ordered->file_offset, ordered->len);
-               if (ret) {
-                       err = ret;
-                       WARN_ON(1);
-                       goto out_unlock;
-               }
-       }
-
-       add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
-       ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-       if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-               btrfs_update_inode_fallback(trans, root, inode);
-       ret = 0;
-out_unlock:
-       unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-                            ordered->file_offset + ordered->len - 1,
-                            &cached_state, GFP_NOFS);
-out:
-       btrfs_delalloc_release_metadata(inode, ordered->len);
-       btrfs_end_transaction(trans, root);
-       ordered_offset = ordered->file_offset + ordered->len;
-       btrfs_put_ordered_extent(ordered);
-       btrfs_put_ordered_extent(ordered);
-
+       ordered->work.func = finish_ordered_fn;
+       ordered->work.flags = 0;
+       btrfs_queue_worker(&root->fs_info->endio_write_workers,
+                          &ordered->work);
  out_test:
         /*
          * our bio might span multiple ordered extents.  If we haven't
@@ -5990,12 +5947,12 @@ out_test:
         if (ordered_offset < dip->logical_offset + dip->bytes) {
                 ordered_bytes = dip->logical_offset + dip->bytes -
                         ordered_offset;
+               ordered = NULL;
                 goto again;
         }
  out_done:
         bio->bi_private = dip->private;
  
-       kfree(dip->csums);
         kfree(dip);
  
         /* If we had an error make sure to clear the uptodate flag */
@@ -6063,9 +6020,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
         int ret;
  
         bio_get(bio);
-       ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-       if (ret)
-               goto err;
+
+       if (!write) {
+               ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+               if (ret)
+                       goto err;
+       }
  
         if (skip_sum)
                 goto map;
@@ -6485,13 +6445,13 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
  
  static void btrfs_invalidatepage(struct page *page, unsigned long offset)
  {
+       struct inode *inode = page->mapping->host;
         struct extent_io_tree *tree;
         struct btrfs_ordered_extent *ordered;
         struct extent_state *cached_state = NULL;
         u64 page_start = page_offset(page);
         u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
  
-
         /*
          * we have the page locked, so new writeback can't start,
          * and the dirty bit won't be cleared while we are here.
@@ -6501,13 +6461,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
          */
         wait_on_page_writeback(page);
  
-       tree = &BTRFS_I(page->mapping->host)->io_tree;
+       tree = &BTRFS_I(inode)->io_tree;
         if (offset) {
                 btrfs_releasepage(page, GFP_NOFS);
                 return;
         }
         lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
-       ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+       ordered = btrfs_lookup_ordered_extent(inode,
                                            page_offset(page));
         if (ordered) {
                 /*
@@ -6522,9 +6482,10 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
                  * whoever cleared the private bit is responsible
                  * for the finish_ordered_io
                  */
-               if (TestClearPagePrivate2(page)) {
-                       btrfs_finish_ordered_io(page->mapping->host,
-                                               page_start, page_end);
+               if (TestClearPagePrivate2(page) &&
+                   btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
+                                                  PAGE_CACHE_SIZE, 1)) {
+                       btrfs_finish_ordered_io(ordered);
                 }
                 btrfs_put_ordered_extent(ordered);
                 cached_state = NULL;
@@ -6576,7 +6537,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
  
         ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
         if (!ret) {
-               ret = btrfs_update_time(vma->vm_file);
+               ret = file_update_time(vma->vm_file);
                 reserved = 1;
         }
         if (ret) {
@@ -6771,7 +6732,8 @@ static int btrfs_truncate(struct inode *inode)
          * using truncate to replace the contents of the file will
          * end up with a zero length file after a crash.
          */
-       if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+       if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+                                          &BTRFS_I(inode)->runtime_flags))
                 btrfs_add_ordered_operation(trans, root, inode);
  
         while (1) {
@@ -6894,7 +6856,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->root = NULL;
         ei->space_info = NULL;
         ei->generation = 0;
-       ei->sequence = 0;
         ei->last_trans = 0;
         ei->last_sub_trans = 0;
         ei->logged_trans = 0;
@@ -6909,11 +6870,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         ei->outstanding_extents = 0;
         ei->reserved_extents = 0;
  
-       ei->ordered_data_close = 0;
-       ei->orphan_meta_reserved = 0;
-       ei->dummy_inode = 0;
-       ei->in_defrag = 0;
-       ei->delalloc_meta_reserved = 0;
+       ei->runtime_flags = 0;
         ei->force_compress = BTRFS_COMPRESS_NONE;
  
         ei->delayed_node = NULL;
@@ -6927,7 +6884,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
         mutex_init(&ei->log_mutex);
         mutex_init(&ei->delalloc_mutex);
         btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-       INIT_LIST_HEAD(&ei->i_orphan);
         INIT_LIST_HEAD(&ei->delalloc_inodes);
         INIT_LIST_HEAD(&ei->ordered_operations);
         RB_CLEAR_NODE(&ei->rb_node);
@@ -6972,13 +6928,12 @@ void btrfs_destroy_inode(struct inode *inode)
                 spin_unlock(&root->fs_info->ordered_extent_lock);
         }
  
-       spin_lock(&root->orphan_lock);
-       if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+       if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+                    &BTRFS_I(inode)->runtime_flags)) {
                 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
                        (unsigned long long)btrfs_ino(inode));
-               list_del_init(&BTRFS_I(inode)->i_orphan);
+               atomic_dec(&root->orphan_inodes);
         }
-       spin_unlock(&root->orphan_lock);
  
         while (1) {
                 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -7193,6 +7148,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
                 btrfs_add_ordered_operation(trans, root, old_inode);
  
+       inode_inc_iversion(old_dir);
+       inode_inc_iversion(new_dir);
+       inode_inc_iversion(old_inode);
         old_dir->i_ctime = old_dir->i_mtime = ctime;
         new_dir->i_ctime = new_dir->i_mtime = ctime;
         old_inode->i_ctime = ctime;
@@ -7219,6 +7177,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         }
  
         if (new_inode) {
+               inode_inc_iversion(new_inode);
                 new_inode->i_ctime = CURRENT_TIME;
                 if (unlikely(btrfs_ino(new_inode) ==
                              BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7490,6 +7449,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
                 cur_offset += ins.offset;
                 *alloc_hint = ins.objectid + ins.offset;
  
+               inode_inc_iversion(inode);
                 inode->i_ctime = CURRENT_TIME;
                 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7647,6 +7607,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
         .permission     = btrfs_permission,
         .fiemap         = btrfs_fiemap,
         .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_special_inode_operations = {
         .getattr        = btrfs_getattr,
@@ -7657,6 +7618,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
         .listxattr      = btrfs_listxattr,
         .removexattr    = btrfs_removexattr,
         .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
  };
  static const struct inode_operations btrfs_symlink_inode_operations = {
         .readlink       = generic_readlink,
@@ -7670,6 +7632,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
         .listxattr      = btrfs_listxattr,
         .removexattr    = btrfs_removexattr,
         .get_acl        = btrfs_get_acl,
+       .update_time    = btrfs_update_time,
  };
  
  const struct dentry_operations btrfs_dentry_operations = {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c

index 14f8e1faa46ee0478ebb83d6f82d205d25c1dc51..24b776c08d99f7bbb621076f68500464b6829435 100644 (file)
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -261,6 +261,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
         }
  
         btrfs_update_iflags(inode);
+       inode_inc_iversion(inode);
         inode->i_ctime = CURRENT_TIME;
         ret = btrfs_update_inode(trans, root, inode);
  
@@ -367,7 +368,7 @@ static noinline int create_subvol(struct btrfs_root *root,
                 return PTR_ERR(trans);
  
         leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-                                     0, objectid, NULL, 0, 0, 0, 0);
+                                     0, objectid, NULL, 0, 0, 0);
         if (IS_ERR(leaf)) {
                 ret = PTR_ERR(leaf);
                 goto fail;
@@ -2262,10 +2263,12 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
         di_args->bytes_used = dev->bytes_used;
         di_args->total_bytes = dev->total_bytes;
         memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
-       if (dev->name)
+       if (dev->name) {
                 strncpy(di_args->path, dev->name, sizeof(di_args->path));
-       else
+               di_args->path[sizeof(di_args->path) - 1] = 0;
+       } else {
                 di_args->path[0] = '\0';
+       }
  
  out:
         if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
@@ -2622,6 +2625,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                         btrfs_mark_buffer_dirty(leaf);
                         btrfs_release_path(path);
  
+                       inode_inc_iversion(inode);
                         inode->i_mtime = inode->i_ctime = CURRENT_TIME;
  
                         /*
@@ -2914,7 +2918,7 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
                 up_read(&info->groups_sem);
         }
  
-       user_dest = (struct btrfs_ioctl_space_info *)
+       user_dest = (struct btrfs_ioctl_space_info __user *)
                 (arg + sizeof(struct btrfs_ioctl_space_args));
  
         if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -3042,6 +3046,28 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
         return ret;
  }
  
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+                                     void __user *arg, int reset_after_read)
+{
+       struct btrfs_ioctl_get_dev_stats *sa;
+       int ret;
+
+       if (reset_after_read && !capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       sa = memdup_user(arg, sizeof(*sa));
+       if (IS_ERR(sa))
+               return PTR_ERR(sa);
+
+       ret = btrfs_get_dev_stats(root, sa, reset_after_read);
+
+       if (copy_to_user(arg, sa, sizeof(*sa)))
+               ret = -EFAULT;
+
+       kfree(sa);
+       return ret;
+}
+
  static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
  {
         int ret = 0;
@@ -3212,8 +3238,9 @@ void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
         }
  }
  
-static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
  {
+       struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
         struct btrfs_fs_info *fs_info = root->fs_info;
         struct btrfs_ioctl_balance_args *bargs;
         struct btrfs_balance_control *bctl;
@@ -3225,6 +3252,10 @@ static long btrfs_ioctl_balance(struct btrfs_root *root, void __user *arg)
         if (fs_info->sb->s_flags & MS_RDONLY)
                 return -EROFS;
  
+       ret = mnt_want_write(file->f_path.mnt);
+       if (ret)
+               return ret;
+
         mutex_lock(&fs_info->volume_mutex);
         mutex_lock(&fs_info->balance_mutex);
  
@@ -3291,6 +3322,7 @@ out_bargs:
  out:
         mutex_unlock(&fs_info->balance_mutex);
         mutex_unlock(&fs_info->volume_mutex);
+       mnt_drop_write(file->f_path.mnt);
         return ret;
  }
  
@@ -3386,7 +3418,7 @@ long btrfs_ioctl(struct file *file, unsigned int
         case BTRFS_IOC_DEV_INFO:
                 return btrfs_ioctl_dev_info(root, argp);
         case BTRFS_IOC_BALANCE:
-               return btrfs_ioctl_balance(root, NULL);
+               return btrfs_ioctl_balance(file, NULL);
         case BTRFS_IOC_CLONE:
                 return btrfs_ioctl_clone(file, arg, 0, 0, 0);
         case BTRFS_IOC_CLONE_RANGE:
@@ -3419,11 +3451,15 @@ long btrfs_ioctl(struct file *file, unsigned int
         case BTRFS_IOC_SCRUB_PROGRESS:
                 return btrfs_ioctl_scrub_progress(root, argp);
         case BTRFS_IOC_BALANCE_V2:
-               return btrfs_ioctl_balance(root, argp);
+               return btrfs_ioctl_balance(file, argp);
         case BTRFS_IOC_BALANCE_CTL:
                 return btrfs_ioctl_balance_ctl(root, arg);
         case BTRFS_IOC_BALANCE_PROGRESS:
                 return btrfs_ioctl_balance_progress(root, argp);
+       case BTRFS_IOC_GET_DEV_STATS:
+               return btrfs_ioctl_get_dev_stats(root, argp, 0);
+       case BTRFS_IOC_GET_AND_RESET_DEV_STATS:
+               return btrfs_ioctl_get_dev_stats(root, argp, 1);
         }
  
         return -ENOTTY;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h

index 086e6bdae1c4482b93b6dda4d16b1c5af288f2eb..497c530724cf6b7a50296d2c6660fef7f4066cb9 100644 (file)
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -266,6 +266,35 @@ struct btrfs_ioctl_logical_ino_args {
         __u64                           inodes;
  };
  
+enum btrfs_dev_stat_values {
+       /* disk I/O failure stats */
+       BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */
+       BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */
+       BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */
+
+       /* stats for indirect indications for I/O failures */
+       BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or
+                                        * contents are illegal: this is an
+                                        * indication that the block was damaged
+                                        * during read or write, or written to
+                                        * wrong location or read from wrong
+                                        * location */
+       BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not
+                                        * been written */
+
+       BTRFS_DEV_STAT_VALUES_MAX
+};
+
+struct btrfs_ioctl_get_dev_stats {
+       __u64 devid;                            /* in */
+       __u64 nr_items;                         /* in/out */
+
+       /* out values: */
+       __u64 values[BTRFS_DEV_STAT_VALUES_MAX];
+
+       __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */
+};
+
  #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
                                    struct btrfs_ioctl_vol_args)
  #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -330,5 +359,9 @@ struct btrfs_ioctl_logical_ino_args {
                                         struct btrfs_ioctl_ino_path_args)
  #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
                                         struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \
+                                     struct btrfs_ioctl_get_dev_stats)
+#define BTRFS_IOC_GET_AND_RESET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 53, \
+                                       struct btrfs_ioctl_get_dev_stats)
  
  #endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c

index bbf6d0d9aebe9b68f0ea8e5c121783d81733f7d7..9e138cdc36c5eb7d66bf80dfc37829878eeaa6e2 100644 (file)
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -196,7 +196,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
         entry->len = len;
         entry->disk_len = disk_len;
         entry->bytes_left = len;
-       entry->inode = inode;
+       entry->inode = igrab(inode);
         entry->compress_type = compress_type;
         if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                 set_bit(type, &entry->flags);
@@ -212,12 +212,12 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
  
         trace_btrfs_ordered_extent_add(inode, entry);
  
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         node = tree_insert(&tree->tree, file_offset,
                            &entry->rb_node);
         if (node)
                 ordered_data_tree_panic(inode, -EEXIST, file_offset);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
  
         spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
         list_add_tail(&entry->root_extent_list,
@@ -264,9 +264,9 @@ void btrfs_add_ordered_sum(struct inode *inode,
         struct btrfs_ordered_inode_tree *tree;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         list_add_tail(&sum->list, &entry->list);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
  }
  
  /*
@@ -283,18 +283,19 @@ void btrfs_add_ordered_sum(struct inode *inode,
   */
  int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                  u64 *file_offset, u64 io_size)
+                                  u64 *file_offset, u64 io_size, int uptodate)
  {
         struct btrfs_ordered_inode_tree *tree;
         struct rb_node *node;
         struct btrfs_ordered_extent *entry = NULL;
         int ret;
+       unsigned long flags;
         u64 dec_end;
         u64 dec_start;
         u64 to_dec;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irqsave(&tree->lock, flags);
         node = tree_search(tree, *file_offset);
         if (!node) {
                 ret = 1;
@@ -323,6 +324,9 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                        (unsigned long long)to_dec);
         }
         entry->bytes_left -= to_dec;
+       if (!uptodate)
+               set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
         if (entry->bytes_left == 0)
                 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
         else
@@ -332,7 +336,7 @@ out:
                 *cached = entry;
                 atomic_inc(&entry->refs);
         }
-       spin_unlock(&tree->lock);
+       spin_unlock_irqrestore(&tree->lock, flags);
         return ret == 0;
  }
  
@@ -347,15 +351,21 @@ out:
   */
  int btrfs_dec_test_ordered_pending(struct inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                  u64 file_offset, u64 io_size)
+                                  u64 file_offset, u64 io_size, int uptodate)
  {
         struct btrfs_ordered_inode_tree *tree;
         struct rb_node *node;
         struct btrfs_ordered_extent *entry = NULL;
+       unsigned long flags;
         int ret;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irqsave(&tree->lock, flags);
+       if (cached && *cached) {
+               entry = *cached;
+               goto have_entry;
+       }
+
         node = tree_search(tree, file_offset);
         if (!node) {
                 ret = 1;
@@ -363,6 +373,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
         }
  
         entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
         if (!offset_in_entry(entry, file_offset)) {
                 ret = 1;
                 goto out;
@@ -374,6 +385,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
                        (unsigned long long)io_size);
         }
         entry->bytes_left -= io_size;
+       if (!uptodate)
+               set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
         if (entry->bytes_left == 0)
                 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
         else
@@ -383,7 +397,7 @@ out:
                 *cached = entry;
                 atomic_inc(&entry->refs);
         }
-       spin_unlock(&tree->lock);
+       spin_unlock_irqrestore(&tree->lock, flags);
         return ret == 0;
  }
  
@@ -399,6 +413,8 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
         trace_btrfs_ordered_extent_put(entry->inode, entry);
  
         if (atomic_dec_and_test(&entry->refs)) {
+               if (entry->inode)
+                       btrfs_add_delayed_iput(entry->inode);
                 while (!list_empty(&entry->list)) {
                         cur = entry->list.next;
                         sum = list_entry(cur, struct btrfs_ordered_sum, list);
@@ -411,21 +427,22 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
  
  /*
   * remove an ordered extent from the tree.  No references are dropped
- * and you must wake_up entry->wait.  You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
   */
-static void __btrfs_remove_ordered_extent(struct inode *inode,
-                                         struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+                                struct btrfs_ordered_extent *entry)
  {
         struct btrfs_ordered_inode_tree *tree;
         struct btrfs_root *root = BTRFS_I(inode)->root;
         struct rb_node *node;
  
         tree = &BTRFS_I(inode)->ordered_tree;
+       spin_lock_irq(&tree->lock);
         node = &entry->rb_node;
         rb_erase(node, &tree->tree);
         tree->last = NULL;
         set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+       spin_unlock_irq(&tree->lock);
  
         spin_lock(&root->fs_info->ordered_extent_lock);
         list_del_init(&entry->root_extent_list);
@@ -442,21 +459,6 @@ static void __btrfs_remove_ordered_extent(struct inode *inode,
                 list_del_init(&BTRFS_I(inode)->ordered_operations);
         }
         spin_unlock(&root->fs_info->ordered_extent_lock);
-}
-
-/*
- * remove an ordered extent from the tree.  No references are dropped
- * but any waiters are woken.
- */
-void btrfs_remove_ordered_extent(struct inode *inode,
-                                struct btrfs_ordered_extent *entry)
-{
-       struct btrfs_ordered_inode_tree *tree;
-
-       tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
-       __btrfs_remove_ordered_extent(inode, entry);
-       spin_unlock(&tree->lock);
         wake_up(&entry->wait);
  }
  
@@ -621,19 +623,11 @@ void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
                 if (orig_end > INT_LIMIT(loff_t))
                         orig_end = INT_LIMIT(loff_t);
         }
-again:
+
         /* start IO across the range first to instantiate any delalloc
          * extents
          */
-       filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-       /* The compression code will leave pages locked but return from
-        * writepage without setting the page writeback.  Starting again
-        * with WB_SYNC_ALL will end up waiting for the IO to actually start.
-        */
-       filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
-
-       filemap_fdatawait_range(inode->i_mapping, start, orig_end);
+       filemap_write_and_wait_range(inode->i_mapping, start, orig_end);
  
         end = orig_end;
         found = 0;
@@ -657,11 +651,6 @@ again:
                         break;
                 end--;
         }
-       if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-                          EXTENT_DELALLOC, 0, NULL)) {
-               schedule_timeout(1);
-               goto again;
-       }
  }
  
  /*
@@ -676,7 +665,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
         struct btrfs_ordered_extent *entry = NULL;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         node = tree_search(tree, file_offset);
         if (!node)
                 goto out;
@@ -687,7 +676,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
         if (entry)
                 atomic_inc(&entry->refs);
  out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
         return entry;
  }
  
@@ -703,7 +692,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
         struct btrfs_ordered_extent *entry = NULL;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         node = tree_search(tree, file_offset);
         if (!node) {
                 node = tree_search(tree, file_offset + len);
@@ -728,7 +717,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
  out:
         if (entry)
                 atomic_inc(&entry->refs);
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
         return entry;
  }
  
@@ -744,7 +733,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
         struct btrfs_ordered_extent *entry = NULL;
  
         tree = &BTRFS_I(inode)->ordered_tree;
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         node = tree_search(tree, file_offset);
         if (!node)
                 goto out;
@@ -752,7 +741,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
         entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
         atomic_inc(&entry->refs);
  out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
         return entry;
  }
  
@@ -764,7 +753,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                 struct btrfs_ordered_extent *ordered)
  {
         struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-       struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
         u64 disk_i_size;
         u64 new_i_size;
         u64 i_size_test;
@@ -779,7 +767,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
         else
                 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
  
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         disk_i_size = BTRFS_I(inode)->disk_i_size;
  
         /* truncate file */
@@ -797,14 +785,6 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                 goto out;
         }
  
-       /*
-        * we can't update the disk_isize if there are delalloc bytes
-        * between disk_i_size and  this ordered extent
-        */
-       if (test_range_bit(io_tree, disk_i_size, offset - 1,
-                          EXTENT_DELALLOC, 0, NULL)) {
-               goto out;
-       }
         /*
          * walk backward from this ordered extent to disk_i_size.
          * if we find an ordered extent then we can't update disk i_size
@@ -825,15 +805,18 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                 }
                 node = prev;
         }
-       while (node) {
+       for (; node; node = rb_prev(node)) {
                 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+               /* We treat this entry as if it doesn't exist */
+               if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+                       continue;
                 if (test->file_offset + test->len <= disk_i_size)
                         break;
                 if (test->file_offset >= i_size)
                         break;
                 if (test->file_offset >= disk_i_size)
                         goto out;
-               node = rb_prev(node);
         }
         new_i_size = min_t(u64, offset, i_size);
  
@@ -851,43 +834,49 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                 else
                         node = rb_first(&tree->tree);
         }
-       i_size_test = 0;
-       if (node) {
-               /*
-                * do we have an area where IO might have finished
-                * between our ordered extent and the next one.
-                */
+
+       /*
+        * We are looking for an area between our current extent and the next
+        * ordered extent to update the i_size to.  There are 3 cases here
+        *
+        * 1) We don't actually have anything and we can update to i_size.
+        * 2) We have stuff but they already did their i_size update so again we
+        * can just update to i_size.
+        * 3) We have an outstanding ordered extent so the most we can update
+        * our disk_i_size to is the start of the next offset.
+        */
+       i_size_test = i_size;
+       for (; node; node = rb_next(node)) {
                 test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-               if (test->file_offset > offset)
+
+               if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+                       continue;
+               if (test->file_offset > offset) {
                         i_size_test = test->file_offset;
-       } else {
-               i_size_test = i_size;
+                       break;
+               }
         }
  
         /*
          * i_size_test is the end of a region after this ordered
-        * extent where there are no ordered extents.  As long as there
-        * are no delalloc bytes in this area, it is safe to update
-        * disk_i_size to the end of the region.
+        * extent where there are no ordered extents, we can safely set
+        * disk_i_size to this.
          */
-       if (i_size_test > offset &&
-           !test_range_bit(io_tree, offset, i_size_test - 1,
-                           EXTENT_DELALLOC, 0, NULL)) {
+       if (i_size_test > offset)
                 new_i_size = min_t(u64, i_size_test, i_size);
-       }
         BTRFS_I(inode)->disk_i_size = new_i_size;
         ret = 0;
  out:
         /*
-        * we need to remove the ordered extent with the tree lock held
-        * so that other people calling this function don't find our fully
-        * processed ordered entry and skip updating the i_size
+        * We need to do this because we can't remove ordered extents until
+        * after the disk_i_size has been updated and then the inode has been
+        * updated to reflect the change, so we need to tell anybody who finds
+        * this ordered extent that we've already done all the real work, we
+        * just haven't completed all the other work.
          */
         if (ordered)
-               __btrfs_remove_ordered_extent(inode, ordered);
-       spin_unlock(&tree->lock);
-       if (ordered)
-               wake_up(&ordered->wait);
+               set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+       spin_unlock_irq(&tree->lock);
         return ret;
  }
  
@@ -912,7 +901,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
         if (!ordered)
                 return 1;
  
-       spin_lock(&tree->lock);
+       spin_lock_irq(&tree->lock);
         list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
                 if (disk_bytenr >= ordered_sum->bytenr) {
                         num_sectors = ordered_sum->len / sectorsize;
@@ -927,7 +916,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
                 }
         }
  out:
-       spin_unlock(&tree->lock);
+       spin_unlock_irq(&tree->lock);
         btrfs_put_ordered_extent(ordered);
         return ret;
  }
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h

index c355ad4dc1a66962d30557e9bbdc08ca9fc25da8..e03c560d299732cfe2114fe41d049b691a949e61 100644 (file)
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -74,6 +74,12 @@ struct btrfs_ordered_sum {
  
  #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
  
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+                                      * has done its due diligence in updating
+                                      * the isize. */
+
  struct btrfs_ordered_extent {
         /* logical offset in the file */
         u64 file_offset;
@@ -113,6 +119,8 @@ struct btrfs_ordered_extent {
  
         /* a per root list of all the pending ordered extents */
         struct list_head root_extent_list;
+
+       struct btrfs_work work;
  };
  
  
@@ -143,10 +151,11 @@ void btrfs_remove_ordered_extent(struct inode *inode,
                                 struct btrfs_ordered_extent *entry);
  int btrfs_dec_test_ordered_pending(struct inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                  u64 file_offset, u64 io_size);
+                                  u64 file_offset, u64 io_size, int uptodate);
  int btrfs_dec_test_first_ordered_pending(struct inode *inode,
                                    struct btrfs_ordered_extent **cached,
-                                  u64 *file_offset, u64 io_size);
+                                  u64 *file_offset, u64 io_size,
+                                  int uptodate);
  int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
                              u64 start, u64 len, u64 disk_len, int type);
  int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c

index f38e452486b8d12ba36589248579dc158981c3be..5e23684887eb8eb401594af69b1be7372f7188aa 100644 (file)
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -294,6 +294,10 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
                                btrfs_dev_extent_chunk_offset(l, dev_extent),
                                (unsigned long long)
                                btrfs_dev_extent_length(l, dev_extent));
+                       break;
+               case BTRFS_DEV_STATS_KEY:
+                       printk(KERN_INFO "\t\tdevice stats\n");
+                       break;
                 };
         }
  }
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c

index ac5d010858848d007e380d529476ad9eb4f6fb31..48a4882d8ad5955eaa0be2b940e35f0b3b2a7f6f 100644 (file)
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -718,13 +718,18 @@ static void reada_start_machine_worker(struct btrfs_work *work)
  {
         struct reada_machine_work *rmw;
         struct btrfs_fs_info *fs_info;
+       int old_ioprio;
  
         rmw = container_of(work, struct reada_machine_work, work);
         fs_info = rmw->fs_info;
  
         kfree(rmw);
  
+       old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+                                      task_nice_ioprio(current));
+       set_task_ioprio(current, BTRFS_IOPRIO_READA);
         __reada_start_machine(fs_info);
+       set_task_ioprio(current, old_ioprio);
  }
  
  static void __reada_start_machine(struct btrfs_fs_info *fs_info)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c

index 2f3d6f917fb3373c02335b6912fcba1006f5fabe..a38cfa4f251ec1065410f561188c4adf5868cea3 100644 (file)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -50,7 +50,7 @@ struct scrub_dev;
  struct scrub_page {
         struct scrub_block      *sblock;
         struct page             *page;
-       struct block_device     *bdev;
+       struct btrfs_device     *dev;
         u64                     flags;  /* extent flags */
         u64                     generation;
         u64                     logical;
@@ -86,6 +86,7 @@ struct scrub_block {
                 unsigned int    header_error:1;
                 unsigned int    checksum_error:1;
                 unsigned int    no_io_error_seen:1;
+               unsigned int    generation_error:1; /* also sets header_error */
         };
  };
  
@@ -675,6 +676,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 sdev->stat.read_errors++;
                 sdev->stat.uncorrectable_errors++;
                 spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                 goto out;
         }
  
@@ -686,6 +689,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 sdev->stat.read_errors++;
                 sdev->stat.uncorrectable_errors++;
                 spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                 goto out;
         }
         BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
@@ -699,6 +704,8 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 sdev->stat.read_errors++;
                 sdev->stat.uncorrectable_errors++;
                 spin_unlock(&sdev->stat_lock);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
                 goto out;
         }
  
@@ -725,12 +732,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 spin_unlock(&sdev->stat_lock);
                 if (__ratelimit(&_rs))
                         scrub_print_warning("i/o error", sblock_to_check);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_READ_ERRS);
         } else if (sblock_bad->checksum_error) {
                 spin_lock(&sdev->stat_lock);
                 sdev->stat.csum_errors++;
                 spin_unlock(&sdev->stat_lock);
                 if (__ratelimit(&_rs))
                         scrub_print_warning("checksum error", sblock_to_check);
+               btrfs_dev_stat_inc_and_print(sdev->dev,
+                                            BTRFS_DEV_STAT_CORRUPTION_ERRS);
         } else if (sblock_bad->header_error) {
                 spin_lock(&sdev->stat_lock);
                 sdev->stat.verify_errors++;
@@ -738,6 +749,12 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
                 if (__ratelimit(&_rs))
                         scrub_print_warning("checksum/header error",
                                             sblock_to_check);
+               if (sblock_bad->generation_error)
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_GENERATION_ERRS);
+               else
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
         }
  
         if (sdev->readonly)
@@ -998,8 +1015,8 @@ static int scrub_setup_recheck_block(struct scrub_dev *sdev,
                         page = sblock->pagev + page_index;
                         page->logical = logical;
                         page->physical = bbio->stripes[mirror_index].physical;
-                       /* for missing devices, bdev is NULL */
-                       page->bdev = bbio->stripes[mirror_index].dev->bdev;
+                       /* for missing devices, dev->bdev is NULL */
+                       page->dev = bbio->stripes[mirror_index].dev;
                         page->mirror_num = mirror_index + 1;
                         page->page = alloc_page(GFP_NOFS);
                         if (!page->page) {
@@ -1043,7 +1060,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                 struct scrub_page *page = sblock->pagev + page_num;
                 DECLARE_COMPLETION_ONSTACK(complete);
  
-               if (page->bdev == NULL) {
+               if (page->dev->bdev == NULL) {
                         page->io_error = 1;
                         sblock->no_io_error_seen = 0;
                         continue;
@@ -1053,7 +1070,7 @@ static int scrub_recheck_block(struct btrfs_fs_info *fs_info,
                 bio = bio_alloc(GFP_NOFS, 1);
                 if (!bio)
                         return -EIO;
-               bio->bi_bdev = page->bdev;
+               bio->bi_bdev = page->dev->bdev;
                 bio->bi_sector = page->physical >> 9;
                 bio->bi_end_io = scrub_complete_bio_end_io;
                 bio->bi_private = &complete;
@@ -1102,11 +1119,14 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
                 h = (struct btrfs_header *)mapped_buffer;
  
                 if (sblock->pagev[0].logical != le64_to_cpu(h->bytenr) ||
-                   generation != le64_to_cpu(h->generation) ||
                     memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
                     memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
-                          BTRFS_UUID_SIZE))
+                          BTRFS_UUID_SIZE)) {
                         sblock->header_error = 1;
+               } else if (generation != le64_to_cpu(h->generation)) {
+                       sblock->header_error = 1;
+                       sblock->generation_error = 1;
+               }
                 csum = h->csum;
         } else {
                 if (!have_csum)
@@ -1182,7 +1202,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                 bio = bio_alloc(GFP_NOFS, 1);
                 if (!bio)
                         return -EIO;
-               bio->bi_bdev = page_bad->bdev;
+               bio->bi_bdev = page_bad->dev->bdev;
                 bio->bi_sector = page_bad->physical >> 9;
                 bio->bi_end_io = scrub_complete_bio_end_io;
                 bio->bi_private = &complete;
@@ -1196,6 +1216,12 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
  
                 /* this will also unplug the queue */
                 wait_for_completion(&complete);
+               if (!bio_flagged(bio, BIO_UPTODATE)) {
+                       btrfs_dev_stat_inc_and_print(page_bad->dev,
+                               BTRFS_DEV_STAT_WRITE_ERRS);
+                       bio_put(bio);
+                       return -EIO;
+               }
                 bio_put(bio);
         }
  
@@ -1352,7 +1378,8 @@ static int scrub_checksum_super(struct scrub_block *sblock)
         u64 mapped_size;
         void *p;
         u32 crc = ~(u32)0;
-       int fail = 0;
+       int fail_gen = 0;
+       int fail_cor = 0;
         u64 len;
         int index;
  
@@ -1363,13 +1390,13 @@ static int scrub_checksum_super(struct scrub_block *sblock)
         memcpy(on_disk_csum, s->csum, sdev->csum_size);
  
         if (sblock->pagev[0].logical != le64_to_cpu(s->bytenr))
-               ++fail;
+               ++fail_cor;
  
         if (sblock->pagev[0].generation != le64_to_cpu(s->generation))
-               ++fail;
+               ++fail_gen;
  
         if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
-               ++fail;
+               ++fail_cor;
  
         len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
         mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
@@ -1394,9 +1421,9 @@ static int scrub_checksum_super(struct scrub_block *sblock)
  
         btrfs_csum_final(crc, calculated_csum);
         if (memcmp(calculated_csum, on_disk_csum, sdev->csum_size))
-               ++fail;
+               ++fail_cor;
  
-       if (fail) {
+       if (fail_cor + fail_gen) {
                 /*
                  * if we find an error in a super block, we just report it.
                  * They will get written with the next transaction commit
@@ -1405,9 +1432,15 @@ static int scrub_checksum_super(struct scrub_block *sblock)
                 spin_lock(&sdev->stat_lock);
                 ++sdev->stat.super_errors;
                 spin_unlock(&sdev->stat_lock);
+               if (fail_cor)
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_CORRUPTION_ERRS);
+               else
+                       btrfs_dev_stat_inc_and_print(sdev->dev,
+                               BTRFS_DEV_STAT_GENERATION_ERRS);
         }
  
-       return fail;
+       return fail_cor + fail_gen;
  }
  
  static void scrub_block_get(struct scrub_block *sblock)
@@ -1551,7 +1584,7 @@ static int scrub_pages(struct scrub_dev *sdev, u64 logical, u64 len,
                         return -ENOMEM;
                 }
                 spage->sblock = sblock;
-               spage->bdev = sdev->dev->bdev;
+               spage->dev = sdev->dev;
                 spage->flags = flags;
                 spage->generation = gen;
                 spage->logical = logical;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c

index c5f8fca4195fca9eb3806ebfbccf52d03049691e..96eb9fef7bd279584cf4dd8b6ed42cc09e425c1d 100644 (file)
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -188,7 +188,9 @@ void btrfs_printk(struct btrfs_fs_info *fs_info, const char *fmt, ...)
         va_start(args, fmt);
  
         if (fmt[0] == '<' && isdigit(fmt[1]) && fmt[2] == '>') {
-               strncpy(lvl, fmt, 3);
+               memcpy(lvl, fmt, 3);
+               lvl[3] = '\0';
                 fmt += 3;
-               type = logtypes[fmt[1] - '0'];
+               /* fmt is already past "<N>"; the level digit is lvl[1] */
+               type = logtypes[lvl[1] - '0'];
         } else
@@ -435,11 +436,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
                 case Opt_thread_pool:
                         intarg = 0;
                         match_int(&args[0], &intarg);
-                       if (intarg) {
+                       if (intarg)
                                 info->thread_pool_size = intarg;
-                               printk(KERN_INFO "btrfs: thread pool %d\n",
-                                      info->thread_pool_size);
-                       }
                         break;
                 case Opt_max_inline:
                         num = match_strdup(&args[0]);
@@ -769,7 +767,7 @@ static int btrfs_fill_super(struct super_block *sb,
  #ifdef CONFIG_BTRFS_FS_POSIX_ACL
         sb->s_flags |= MS_POSIXACL;
  #endif
-
+       sb->s_flags |= MS_I_VERSION;
         err = open_ctree(sb, fs_devices, (char *)data);
         if (err) {
                 printk("btrfs: open_ctree failed\n");
@@ -925,63 +923,48 @@ static inline int is_subvolume_inode(struct inode *inode)
   */
  static char *setup_root_args(char *args)
  {
-       unsigned copied = 0;
-       unsigned len = strlen(args) + 2;
-       char *pos;
-       char *ret;
+       unsigned len = strlen(args) + 2 + 1;
+       char *src, *dst, *buf;
  
         /*
-        * We need the same args as before, but minus
-        *
-        * subvol=a
-        *
-        * and add
-        *
-        * subvolid=0
+        * We need the same args as before, but with this substitution:
+        * s!subvol=[^,]+!subvolid=0!
          *
-        * which is a difference of 2 characters, so we allocate strlen(args) +
-        * 2 characters.
+        * Since the replacement string is up to 2 bytes longer than the
+        * original, allocate strlen(args) + 2 + 1 bytes.
          */
-       ret = kzalloc(len * sizeof(char), GFP_NOFS);
-       if (!ret)
-               return NULL;
-       pos = strstr(args, "subvol=");
  
+       src = strstr(args, "subvol=");
         /* This shouldn't happen, but just in case.. */
-       if (!pos) {
-               kfree(ret);
+       if (!src)
+               return NULL;
+
+       buf = dst = kmalloc(len, GFP_NOFS);
+       if (!buf)
                 return NULL;
-       }
  
         /*
-        * The subvol=<> arg is not at the front of the string, copy everybody
-        * up to that into ret.
+        * If the subvol= arg is not at the start of the string,
+        * copy whatever precedes it into buf.
          */
-       if (pos != args) {
-               *pos = '\0';
-               strcpy(ret, args);
-               copied += strlen(args);
-               pos++;
+       if (src != args) {
+               *src++ = '\0';
+               strcpy(buf, args);
+               dst += strlen(args);
         }
  
-       strncpy(ret + copied, "subvolid=0", len - copied);
-
-       /* Length of subvolid=0 */
-       copied += 10;
+       strcpy(dst, "subvolid=0");
+       dst += strlen("subvolid=0");
  
         /*
-        * If there is no , after the subvol= option then we know there's no
-        * other options and we can just return.
+        * If there is a "," after the original subvol=... string,
+        * copy that suffix into our buffer.  Otherwise, we're done.
          */
-       pos = strchr(pos, ',');
-       if (!pos)
-               return ret;
+       src = strchr(src, ',');
+       if (src)
+               strcpy(dst, src);
  
-       /* Copy the rest of the arguments into our buffer */
-       strncpy(ret + copied, pos, len - copied);
-       copied += strlen(pos);
-
-       return ret;
+       return buf;
  }
  
  static struct dentry *mount_subvol(const char *subvol_name, int flags,
@@ -1118,6 +1101,56 @@ error_fs_info:
         return ERR_PTR(error);
  }
  
+/*
+ * Set the max_workers limit of a single worker pool.  The store is done
+ * with workers->lock held and local interrupts disabled.
+ */
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+       spin_lock_irq(&workers->lock);
+       workers->max_workers = new_limit;
+       spin_unlock_irq(&workers->lock);
+}
+
+/*
+ * Switch the filesystem to a new thread pool size.  A no-op when the size
+ * is unchanged; otherwise the new value is stored in
+ * fs_info->thread_pool_size, the change is logged, and every worker pool
+ * listed below gets new_pool_size as its max_workers limit.
+ */
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+                                    int new_pool_size, int old_pool_size)
+{
+       struct btrfs_workers *pools[] = {
+               &fs_info->generic_worker,
+               &fs_info->workers,
+               &fs_info->delalloc_workers,
+               &fs_info->submit_workers,
+               &fs_info->caching_workers,
+               &fs_info->fixup_workers,
+               &fs_info->endio_workers,
+               &fs_info->endio_meta_workers,
+               &fs_info->endio_meta_write_workers,
+               &fs_info->endio_write_workers,
+               &fs_info->endio_freespace_worker,
+               &fs_info->delayed_workers,
+               &fs_info->readahead_workers,
+               &fs_info->scrub_workers,
+       };
+       int i;
+
+       if (new_pool_size == old_pool_size)
+               return;
+
+       fs_info->thread_pool_size = new_pool_size;
+
+       printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+              old_pool_size, new_pool_size);
+
+       for (i = 0; i < ARRAY_SIZE(pools); i++)
+               btrfs_set_max_workers(pools[i], new_pool_size);
+}
+
  static int btrfs_remount(struct super_block *sb, int *flags, char *data)
  {
         struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -1137,6 +1154,9 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                 goto restore;
         }
  
+       btrfs_resize_thread_pool(fs_info,
+               fs_info->thread_pool_size, old_thread_pool_size);
+
         if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
                 return 0;
  
@@ -1180,7 +1200,8 @@ restore:
         fs_info->compress_type = old_compress_type;
         fs_info->max_inline = old_max_inline;
         fs_info->alloc_start = old_alloc_start;
-       fs_info->thread_pool_size = old_thread_pool_size;
+       btrfs_resize_thread_pool(fs_info,
+               old_thread_pool_size, fs_info->thread_pool_size);
         fs_info->metadata_ratio = old_metadata_ratio;
         return ret;
  }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c

index 36422254ef6765c14290a2373fa6d83cf2d364d5..1791c6e3d83487d82c9ffe80ab0239976cfd1c96 100644 (file)
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -28,6 +28,7 @@
  #include "locking.h"
  #include "tree-log.h"
  #include "inode-map.h"
+#include "volumes.h"
  
  #define BTRFS_ROOT_TRANS_TAG 0
  
@@ -55,48 +56,49 @@ static noinline void switch_commit_root(struct btrfs_root *root)
  static noinline int join_transaction(struct btrfs_root *root, int nofail)
  {
         struct btrfs_transaction *cur_trans;
+       struct btrfs_fs_info *fs_info = root->fs_info;
  
-       spin_lock(&root->fs_info->trans_lock);
+       spin_lock(&fs_info->trans_lock);
  loop:
         /* The file system has been taken offline. No new transactions. */
-       if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-               spin_unlock(&root->fs_info->trans_lock);
+       if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+               spin_unlock(&fs_info->trans_lock);
                 return -EROFS;
         }
  
-       if (root->fs_info->trans_no_join) {
+       if (fs_info->trans_no_join) {
                 if (!nofail) {
-                       spin_unlock(&root->fs_info->trans_lock);
+                       spin_unlock(&fs_info->trans_lock);
                         return -EBUSY;
                 }
         }
  
-       cur_trans = root->fs_info->running_transaction;
+       cur_trans = fs_info->running_transaction;
         if (cur_trans) {
                 if (cur_trans->aborted) {
-                       spin_unlock(&root->fs_info->trans_lock);
+                       spin_unlock(&fs_info->trans_lock);
                         return cur_trans->aborted;
                 }
                 atomic_inc(&cur_trans->use_count);
                 atomic_inc(&cur_trans->num_writers);
                 cur_trans->num_joined++;
-               spin_unlock(&root->fs_info->trans_lock);
+               spin_unlock(&fs_info->trans_lock);
                 return 0;
         }
-       spin_unlock(&root->fs_info->trans_lock);
+       spin_unlock(&fs_info->trans_lock);
  
         cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
         if (!cur_trans)
                 return -ENOMEM;
  
-       spin_lock(&root->fs_info->trans_lock);
-       if (root->fs_info->running_transaction) {
+       spin_lock(&fs_info->trans_lock);
+       if (fs_info->running_transaction) {
                 /*
                  * someone started a transaction after we unlocked.  Make sure
                  * to redo the trans_no_join checks above
                  */
                 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-               cur_trans = root->fs_info->running_transaction;
+               cur_trans = fs_info->running_transaction;
                 goto loop;
         }
  
@@ -121,20 +123,38 @@ loop:
         cur_trans->delayed_refs.flushing = 0;
         cur_trans->delayed_refs.run_delayed_start = 0;
         cur_trans->delayed_refs.seq = 1;
+
+       /*
+        * although the tree mod log is per file system and not per transaction,
+        * the log must never go across transaction boundaries.
+        */
+       smp_mb();
+       if (!list_empty(&fs_info->tree_mod_seq_list)) {
+               printk(KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+                       "creating a fresh transaction\n");
+               WARN_ON(1);
+       }
+       if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log)) {
+               printk(KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+                       "creating a fresh transaction\n");
+               WARN_ON(1);
+       }
+       atomic_set(&fs_info->tree_mod_seq, 0);
+
         init_waitqueue_head(&cur_trans->delayed_refs.seq_wait);
         spin_lock_init(&cur_trans->commit_lock);
         spin_lock_init(&cur_trans->delayed_refs.lock);
         INIT_LIST_HEAD(&cur_trans->delayed_refs.seq_head);
  
         INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-       list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+       list_add_tail(&cur_trans->list, &fs_info->trans_list);
         extent_io_tree_init(&cur_trans->dirty_pages,
-                            root->fs_info->btree_inode->i_mapping);
-       root->fs_info->generation++;
-       cur_trans->transid = root->fs_info->generation;
-       root->fs_info->running_transaction = cur_trans;
+                            fs_info->btree_inode->i_mapping);
+       fs_info->generation++;
+       cur_trans->transid = fs_info->generation;
+       fs_info->running_transaction = cur_trans;
         cur_trans->aborted = 0;
-       spin_unlock(&root->fs_info->trans_lock);
+       spin_unlock(&fs_info->trans_lock);
  
         return 0;
  }
@@ -758,6 +778,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
         if (ret)
                 return ret;
  
+       ret = btrfs_run_dev_stats(trans, root->fs_info);
+       BUG_ON(ret);
+
         while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                 next = fs_info->dirty_cowonly_roots.next;
                 list_del_init(next);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c

index eb1ae908582cc51162a61798c80f3ed38e7ab6e8..2017d0ff511ca3304dad46e85045ad2ab28d4e75 100644 (file)
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1628,7 +1628,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
         int i;
         int ret;
  
-       btrfs_read_buffer(eb, gen);
+       ret = btrfs_read_buffer(eb, gen);
+       if (ret)
+               return ret;
  
         level = btrfs_header_level(eb);
  
@@ -1749,7 +1751,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
  
                         path->slots[*level]++;
                         if (wc->free) {
-                               btrfs_read_buffer(next, ptr_gen);
+                               ret = btrfs_read_buffer(next, ptr_gen);
+                               if (ret) {
+                                       free_extent_buffer(next);
+                                       return ret;
+                               }
  
                                 btrfs_tree_lock(next);
                                 btrfs_set_lock_blocking(next);
@@ -1766,7 +1772,11 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                         free_extent_buffer(next);
                         continue;
                 }
-               btrfs_read_buffer(next, ptr_gen);
+               ret = btrfs_read_buffer(next, ptr_gen);
+               if (ret) {
+                       free_extent_buffer(next);
+                       return ret;
+               }
  
                 WARN_ON(*level <= 0);
                 if (path->nodes[*level-1])
@@ -2657,6 +2667,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                 btrfs_release_path(path);
         }
         btrfs_release_path(path);
+       if (ret > 0)
+               ret = 0;
         return ret;
  }
  
@@ -3028,21 +3040,6 @@ out:
         return ret;
  }
  
-static int inode_in_log(struct btrfs_trans_handle *trans,
-                struct inode *inode)
-{
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       int ret = 0;
-
-       mutex_lock(&root->log_mutex);
-       if (BTRFS_I(inode)->logged_trans == trans->transid &&
-           BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-               ret = 1;
-       mutex_unlock(&root->log_mutex);
-       return ret;
-}
-
-
  /*
   * helper function around btrfs_log_inode to make sure newly created
   * parent directories also end up in the log.  A minimal inode and backref
@@ -3083,7 +3080,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         if (ret)
                 goto end_no_trans;
  
-       if (inode_in_log(trans, inode)) {
+       if (btrfs_inode_in_log(inode, trans->transid)) {
                 ret = BTRFS_NO_LOG_SYNC;
                 goto end_no_trans;
         }
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c

index 12f5147bd2b1ae2a6016e7283c72ceccb44283b7..ab942f46b3dd81e06348c4950901f3e4eef87016 100644 (file)
--- a/fs/btrfs/ulist.c
+++ b/fs/btrfs/ulist.c
@@ -23,9 +23,9 @@
   *
   * ulist = ulist_alloc();
   * ulist_add(ulist, root);
- * elem = NULL;
+ * ULIST_ITER_INIT(&uiter);
   *
- * while ((elem = ulist_next(ulist, elem)) {
+ * while ((elem = ulist_next(ulist, &uiter))) {
   *     for (all child nodes n in elem)
   *             ulist_add(ulist, n);
   *     do something useful with the node;
@@ -95,7 +95,7 @@ EXPORT_SYMBOL(ulist_reinit);
   *
   * The allocated ulist will be returned in an initialized state.
   */
-struct ulist *ulist_alloc(unsigned long gfp_mask)
+struct ulist *ulist_alloc(gfp_t gfp_mask)
  {
         struct ulist *ulist = kmalloc(sizeof(*ulist), gfp_mask);
  
@@ -144,13 +144,22 @@ EXPORT_SYMBOL(ulist_free);
   * unaltered.
   */
  int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             unsigned long gfp_mask)
+             gfp_t gfp_mask)
+{
+       return ulist_add_merge(ulist, val, aux, NULL, gfp_mask);
+}
+
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                   unsigned long *old_aux, gfp_t gfp_mask)
  {
         int i;
  
         for (i = 0; i < ulist->nnodes; ++i) {
-               if (ulist->nodes[i].val == val)
+               if (ulist->nodes[i].val == val) {
+                       if (old_aux)
+                               *old_aux = ulist->nodes[i].aux;
                         return 0;
+               }
         }
  
         if (ulist->nnodes >= ulist->nodes_alloced) {
@@ -188,33 +197,26 @@ EXPORT_SYMBOL(ulist_add);
  /**
   * ulist_next - iterate ulist
   * @ulist:     ulist to iterate
- * @prev:      previously returned element or %NULL to start iteration
+ * @uiter:     iterator variable, initialized with ULIST_ITER_INIT(&iterator)
   *
   * Note: locking must be provided by the caller. In case of rwlocks only read
   *       locking is needed
   *
- * This function is used to iterate an ulist. The iteration is started with
- * @prev = %NULL. It returns the next element from the ulist or %NULL when the
+ * This function is used to iterate an ulist.
+ * It returns the next element from the ulist or %NULL when the
   * end is reached. No guarantee is made with respect to the order in which
   * the elements are returned. They might neither be returned in order of
   * addition nor in ascending order.
   * It is allowed to call ulist_add during an enumeration. Newly added items
   * are guaranteed to show up in the running enumeration.
   */
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev)
+struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter)
  {
-       int next;
-
         if (ulist->nnodes == 0)
                 return NULL;
-
-       if (!prev)
-               return &ulist->nodes[0];
-
-       next = (prev - ulist->nodes) + 1;
-       if (next < 0 || next >= ulist->nnodes)
+       if (uiter->i < 0 || uiter->i >= ulist->nnodes)
                 return NULL;
  
-       return &ulist->nodes[next];
+       return &ulist->nodes[uiter->i++];
  }
  EXPORT_SYMBOL(ulist_next);
diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h

index 2e25dec58ec0e56251fbca880d27cc927aac95dc..21bdc8ec813046ac56e3c7db0739bcdba7ac188a 100644 (file)
--- a/fs/btrfs/ulist.h
+++ b/fs/btrfs/ulist.h
@@ -24,6 +24,10 @@
   */
  #define ULIST_SIZE 16
  
+struct ulist_iterator {
+       int i;
+};
+
  /*
   * element of the list
   */
@@ -59,10 +63,15 @@ struct ulist {
  void ulist_init(struct ulist *ulist);
  void ulist_fini(struct ulist *ulist);
  void ulist_reinit(struct ulist *ulist);
-struct ulist *ulist_alloc(unsigned long gfp_mask);
+struct ulist *ulist_alloc(gfp_t gfp_mask);
  void ulist_free(struct ulist *ulist);
  int ulist_add(struct ulist *ulist, u64 val, unsigned long aux,
-             unsigned long gfp_mask);
-struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_node *prev);
+             gfp_t gfp_mask);
+int ulist_add_merge(struct ulist *ulist, u64 val, unsigned long aux,
+                   unsigned long *old_aux, gfp_t gfp_mask);
+struct ulist_node *ulist_next(struct ulist *ulist,
+                             struct ulist_iterator *uiter);
+
+#define ULIST_ITER_INIT(uiter) ((uiter)->i = 0)
  
  #endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c

index 1411b99555a4c1f138a6a3bf699842849d2b3e08..7782020996feccd4b7103528a4c2989230f79b71 100644 (file)
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -23,6 +23,7 @@
  #include <linux/random.h>
  #include <linux/iocontext.h>
  #include <linux/capability.h>
+#include <linux/ratelimit.h>
  #include <linux/kthread.h>
  #include <asm/div64.h>
  #include "compat.h"
@@ -39,6 +40,8 @@ static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct btrfs_device *device);
  static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
  
  static DEFINE_MUTEX(uuid_mutex);
  static LIST_HEAD(fs_uuids);
@@ -361,6 +364,7 @@ static noinline int device_list_add(const char *path,
                         return -ENOMEM;
                 }
                 device->devid = devid;
+               device->dev_stats_valid = 0;
                 device->work.func = pending_bios_fn;
                 memcpy(device->uuid, disk_super->dev_item.uuid,
                        BTRFS_UUID_SIZE);
@@ -1633,7 +1637,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
         int ret = 0;
  
         if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
-               return -EINVAL;
+               return -EROFS;
  
         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
                                   root->fs_info->bdev_holder);
@@ -4001,13 +4005,58 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
         return 0;
  }
  
+static void *merge_stripe_index_into_bio_private(void *bi_private,
+                                                unsigned int stripe_index)
+{
+       /*
+        * with single, dup, RAID0, RAID1 and RAID10, stripe_index is
+        * at most 1.
+        * The alternative solution (instead of stealing bits from the
+        * pointer) would be to allocate an intermediate structure
+        * that contains the old private pointer plus the stripe_index.
+        */
+       BUG_ON((((uintptr_t)bi_private) & 3) != 0);
+       BUG_ON(stripe_index > 3);
+       return (void *)(((uintptr_t)bi_private) | stripe_index);
+}
+
+static struct btrfs_bio *extract_bbio_from_bio_private(void *bi_private)
+{
+       return (struct btrfs_bio *)(((uintptr_t)bi_private) & ~((uintptr_t)3));
+}
+
+static unsigned int extract_stripe_index_from_bio_private(void *bi_private)
+{
+       return (unsigned int)((uintptr_t)bi_private) & 3;
+}
+
  static void btrfs_end_bio(struct bio *bio, int err)
  {
-       struct btrfs_bio *bbio = bio->bi_private;
+       struct btrfs_bio *bbio = extract_bbio_from_bio_private(bio->bi_private);
         int is_orig_bio = 0;
  
-       if (err)
+       if (err) {
                 atomic_inc(&bbio->error);
+               if (err == -EIO || err == -EREMOTEIO) {
+                       unsigned int stripe_index =
+                               extract_stripe_index_from_bio_private(
+                                       bio->bi_private);
+                       struct btrfs_device *dev;
+
+                       BUG_ON(stripe_index >= bbio->num_stripes);
+                       dev = bbio->stripes[stripe_index].dev;
+                       if (bio->bi_rw & WRITE)
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_WRITE_ERRS);
+                       else
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_READ_ERRS);
+                       if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+                               btrfs_dev_stat_inc(dev,
+                                                  BTRFS_DEV_STAT_FLUSH_ERRS);
+                       btrfs_dev_stat_print_on_error(dev);
+               }
+       }
  
         if (bio == bbio->orig_bio)
                 is_orig_bio = 1;
@@ -4149,6 +4198,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
                         bio = first_bio;
                 }
                 bio->bi_private = bbio;
+               bio->bi_private = merge_stripe_index_into_bio_private(
+                               bio->bi_private, (unsigned int)dev_nr);
                 bio->bi_end_io = btrfs_end_bio;
                 bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
                 dev = bbio->stripes[dev_nr].dev;
@@ -4509,6 +4560,28 @@ int btrfs_read_sys_array(struct btrfs_root *root)
         return ret;
  }
  
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                  u64 logical, int mirror_num)
+{
+       struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+       int ret;
+       u64 map_length = 0;
+       struct btrfs_bio *bbio = NULL;
+       struct btrfs_device *device;
+
+       BUG_ON(mirror_num == 0);
+       ret = btrfs_map_block(map_tree, WRITE, logical, &map_length, &bbio,
+                             mirror_num);
+       if (ret) {
+               BUG_ON(bbio != NULL);
+               return NULL;
+       }
+       BUG_ON(mirror_num != bbio->mirror_num);
+       device = bbio->stripes[mirror_num - 1].dev;
+       kfree(bbio);
+       return device;
+}
+
  int btrfs_read_chunk_tree(struct btrfs_root *root)
  {
         struct btrfs_path *path;
@@ -4583,3 +4656,230 @@ error:
         btrfs_free_path(path);
         return ret;
  }
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+               btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_key key;
+       struct btrfs_key found_key;
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct extent_buffer *eb;
+       int slot;
+       int ret = 0;
+       struct btrfs_device *device;
+       struct btrfs_path *path = NULL;
+       int i;
+
+       path = btrfs_alloc_path();
+       if (!path) {
+               ret = -ENOMEM;
+               goto out;
+       }
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               int item_size;
+               struct btrfs_dev_stats_item *ptr;
+
+               key.objectid = 0;
+               key.type = BTRFS_DEV_STATS_KEY;
+               key.offset = device->devid;
+               ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+               if (ret) {
+                       printk(KERN_WARNING "btrfs: no dev_stats entry found for device %s (devid %llu) (OK on first mount after mkfs)\n",
+                              device->name, (unsigned long long)device->devid);
+                       __btrfs_reset_dev_stats(device);
+                       device->dev_stats_valid = 1;
+                       btrfs_release_path(path);
+                       continue;
+               }
+               slot = path->slots[0];
+               eb = path->nodes[0];
+               btrfs_item_key_to_cpu(eb, &found_key, slot);
+               item_size = btrfs_item_size_nr(eb, slot);
+
+               ptr = btrfs_item_ptr(eb, slot,
+                                    struct btrfs_dev_stats_item);
+
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+                       if (item_size >= (1 + i) * sizeof(__le64))
+                               btrfs_dev_stat_set(device, i,
+                                       btrfs_dev_stats_value(eb, ptr, i));
+                       else
+                               btrfs_dev_stat_reset(device, i);
+               }
+
+               device->dev_stats_valid = 1;
+               btrfs_dev_stat_print_on_load(device);
+               btrfs_release_path(path);
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+       btrfs_free_path(path);
+       return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *dev_root,
+                               struct btrfs_device *device)
+{
+       struct btrfs_path *path;
+       struct btrfs_key key;
+       struct extent_buffer *eb;
+       struct btrfs_dev_stats_item *ptr;
+       int ret;
+       int i;
+
+       key.objectid = 0;
+       key.type = BTRFS_DEV_STATS_KEY;
+       key.offset = device->devid;
+
+       path = btrfs_alloc_path();
+       BUG_ON(!path);
+       ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+       if (ret < 0) {
+               printk(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+                      ret, device->name);
+               goto out;
+       }
+
+       if (ret == 0 &&
+           btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+               /* need to delete old one and insert a new one */
+               ret = btrfs_del_item(trans, dev_root, path);
+               if (ret != 0) {
+                       printk(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+                              device->name, ret);
+                       goto out;
+               }
+               ret = 1;
+       }
+
+       if (ret == 1) {
+               /* need to insert a new item */
+               btrfs_release_path(path);
+               ret = btrfs_insert_empty_item(trans, dev_root, path,
+                                             &key, sizeof(*ptr));
+               if (ret < 0) {
+                       printk(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+                              device->name, ret);
+                       goto out;
+               }
+       }
+
+       eb = path->nodes[0];
+       ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+       for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+               btrfs_set_dev_stats_value(eb, ptr, i,
+                                         btrfs_dev_stat_read(device, i));
+       btrfs_mark_buffer_dirty(eb);
+
+out:
+       btrfs_free_path(path);
+       return ret;
+}
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info)
+{
+       struct btrfs_root *dev_root = fs_info->dev_root;
+       struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+       struct btrfs_device *device;
+       int ret = 0;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       list_for_each_entry(device, &fs_devices->devices, dev_list) {
+               if (!device->dev_stats_valid || !device->dev_stats_dirty)
+                       continue;
+
+               ret = update_dev_stat_item(trans, dev_root, device);
+               if (!ret)
+                       device->dev_stats_dirty = 0;
+       }
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+       btrfs_dev_stat_inc(dev, index);
+       btrfs_dev_stat_print_on_error(dev);
+}
+
+void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+       if (!dev->dev_stats_valid)
+               return;
+       printk_ratelimited(KERN_ERR
+                          "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+                          dev->name,
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+                          btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+                          btrfs_dev_stat_read(dev,
+                                              BTRFS_DEV_STAT_CORRUPTION_ERRS),
+                          btrfs_dev_stat_read(dev,
+                                              BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+       printk(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+              dev->name,
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+              btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                       struct btrfs_ioctl_get_dev_stats *stats,
+                       int reset_after_read)
+{
+       struct btrfs_device *dev;
+       struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+       int i;
+
+       mutex_lock(&fs_devices->device_list_mutex);
+       dev = btrfs_find_device(root, stats->devid, NULL, NULL);
+       mutex_unlock(&fs_devices->device_list_mutex);
+
+       if (!dev) {
+               printk(KERN_WARNING
+                      "btrfs: get dev_stats failed, device not found\n");
+               return -ENODEV;
+       } else if (!dev->dev_stats_valid) {
+               printk(KERN_WARNING
+                      "btrfs: get dev_stats failed, not yet valid\n");
+               return -ENODEV;
+       } else if (reset_after_read) {
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+                       if (stats->nr_items > i)
+                               stats->values[i] =
+                                       btrfs_dev_stat_read_and_reset(dev, i);
+                       else
+                               btrfs_dev_stat_reset(dev, i);
+               }
+       } else {
+               for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+                       if (stats->nr_items > i)
+                               stats->values[i] = btrfs_dev_stat_read(dev, i);
+       }
+       if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+               stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+       return 0;
+}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h

index bb6b03f97aaa089793d667fae93335373773a7eb..3406a88ca83e023429b8af19f2d6aa64d4cac6f8 100644 (file)
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -22,6 +22,7 @@
  #include <linux/bio.h>
  #include <linux/sort.h>
  #include "async-thread.h"
+#include "ioctl.h"
  
  #define BTRFS_STRIPE_LEN       (64 * 1024)
  
@@ -106,6 +107,11 @@ struct btrfs_device {
         struct completion flush_wait;
         int nobarriers;
  
+       /* disk I/O failure stats. For detailed description refer to
+        * enum btrfs_dev_stat_values in ioctl.h */
+       int dev_stats_valid;
+       int dev_stats_dirty; /* counters need to be written to disk */
+       atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
  };
  
  struct btrfs_fs_devices {
@@ -281,4 +287,50 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
  int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
  int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
                          u64 *start, u64 *max_avail);
+struct btrfs_device *btrfs_find_device_for_logical(struct btrfs_root *root,
+                                                  u64 logical, int mirror_num);
+void btrfs_dev_stat_print_on_error(struct btrfs_device *device);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+                       struct btrfs_ioctl_get_dev_stats *stats,
+                       int reset_after_read);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+                       struct btrfs_fs_info *fs_info);
+
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+                                     int index)
+{
+       atomic_inc(dev->dev_stat_values + index);
+       dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+                                     int index)
+{
+       return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+                                               int index)
+{
+       int ret;
+
+       ret = atomic_xchg(dev->dev_stat_values + index, 0);
+       dev->dev_stats_dirty = 1;
+       return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+                                     int index, unsigned long val)
+{
+       atomic_set(dev->dev_stat_values + index, val);
+       dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+                                       int index)
+{
+       btrfs_dev_stat_set(dev, index, 0);
+}
  #endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c

index e7a5659087e66f93769bc750562d21294c9bd2b6..3f4e2d69e83a13cb66f3f3a56024f53f5299f5c4 100644 (file)
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -196,6 +196,7 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
         if (ret)
                 goto out;
  
+       inode_inc_iversion(inode);
         inode->i_ctime = CURRENT_TIME;
         ret = btrfs_update_inode(trans, root, inode);
         BUG_ON(ret);
diff --git a/fs/buffer.c b/fs/buffer.c

index ad5938ca357c270ace08388401176f22a6343571..838a9cf246bd0fa561ab66295f9bb3df77e0c6a2 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3152,7 +3152,7 @@ SYSCALL_DEFINE2(bdflush, int, func, long, data)
  /*
   * Buffer-head allocation
   */
-static struct kmem_cache *bh_cachep;
+static struct kmem_cache *bh_cachep __read_mostly;
  
  /*
   * Once the number of bh's in the machine exceeds this level, we start
diff --git a/fs/ceph/export.c b/fs/ceph/export.c

index fbb2a643ef10a1f75c4918f165c9e3a22a603a86..8e1b60e557b65bea0df86a881376456658a9cffd 100644 (file)
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -40,38 +40,49 @@ struct ceph_nfs_confh {
         u32 parent_name_hash;
  } __attribute__ ((packed));
  
-static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
-                         int connectable)
+/*
+ * The presence of @parent_inode here tells us whether NFS wants a
+ * connectable file handle.  However, we want to make a connectable
+ * file handle unconditionally so that the MDS gets as much of a hint
+ * as possible.  That means we only use @parent_inode to indicate
+ * whether nfsd wants a connectable fh, and whether we should indicate
+ * failure from a too-small @max_len.
+ */
+static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
+                         struct inode *parent_inode)
  {
         int type;
         struct ceph_nfs_fh *fh = (void *)rawfh;
         struct ceph_nfs_confh *cfh = (void *)rawfh;
-       struct dentry *parent;
-       struct inode *inode = dentry->d_inode;
         int connected_handle_length = sizeof(*cfh)/4;
         int handle_length = sizeof(*fh)/4;
+       struct dentry *dentry = d_find_alias(inode);
+       struct dentry *parent;
  
         /* don't re-export snaps */
         if (ceph_snap(inode) != CEPH_NOSNAP)
                 return -EINVAL;
  
-       spin_lock(&dentry->d_lock);
-       parent = dentry->d_parent;
-       if (*max_len >= connected_handle_length) {
+       /* if we found an alias, generate a connectable fh */
+       if (*max_len >= connected_handle_length && dentry) {
                 dout("encode_fh %p connectable\n", dentry);
-               cfh->ino = ceph_ino(dentry->d_inode);
+               spin_lock(&dentry->d_lock);
+               parent = dentry->d_parent;
+               cfh->ino = ceph_ino(inode);
                 cfh->parent_ino = ceph_ino(parent->d_inode);
                 cfh->parent_name_hash = ceph_dentry_hash(parent->d_inode,
                                                          dentry);
                 *max_len = connected_handle_length;
                 type = 2;
+               spin_unlock(&dentry->d_lock);
         } else if (*max_len >= handle_length) {
-               if (connectable) {
+               if (parent_inode) {
+                       /* nfsd wants connectable */
                         *max_len = connected_handle_length;
                         type = 255;
                 } else {
                         dout("encode_fh %p\n", dentry);
-                       fh->ino = ceph_ino(dentry->d_inode);
+                       fh->ino = ceph_ino(inode);
                         *max_len = handle_length;
                         type = 1;
                 }
@@ -79,7 +90,6 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
                 *max_len = handle_length;
                 type = 255;
         }
-       spin_unlock(&dentry->d_lock);
         return type;
  }
  
diff --git a/fs/compat.c b/fs/compat.c

index 3adf3d4c2cd9e1a5da0da2bf9a6b984a0d276edf..6161255fac45648efdfe437d9d880d390268d14f 100644 (file)
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -871,12 +871,12 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
  {
         int error;
         struct file *file;
+       int fput_needed;
         struct compat_readdir_callback buf;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.result = 0;
         buf.dirent = dirent;
@@ -885,8 +885,7 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd,
         if (buf.result)
                 error = buf.result;
  
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
  
@@ -953,16 +952,15 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
         struct file * file;
         struct compat_linux_dirent __user * lastdirent;
         struct compat_getdents_callback buf;
+       int fput_needed;
         int error;
  
-       error = -EFAULT;
         if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.current_dir = dirent;
         buf.previous = NULL;
@@ -979,8 +977,7 @@ asmlinkage long compat_sys_getdents(unsigned int fd,
                 else
                         error = count - buf.count;
         }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
  
@@ -1041,16 +1038,15 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
         struct file * file;
         struct linux_dirent64 __user * lastdirent;
         struct compat_getdents_callback64 buf;
+       int fput_needed;
         int error;
  
-       error = -EFAULT;
         if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.current_dir = dirent;
         buf.previous = NULL;
@@ -1068,8 +1064,7 @@ asmlinkage long compat_sys_getdents64(unsigned int fd,
                 else
                         error = count - buf.count;
         }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
  #endif /* ! __ARCH_OMIT_COMPAT_SYS_GETDENTS64 */
diff --git a/fs/dcache.c b/fs/dcache.c

index 4435d8b329044da3b48c83dfe555409464797d0d..85c9e2bff8e65126eaca755e14d1ee4a15a27170 100644 (file)
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -683,8 +683,6 @@ EXPORT_SYMBOL(dget_parent);
  /**
   * d_find_alias - grab a hashed alias of inode
   * @inode: inode in question
- * @want_discon:  flag, used by d_splice_alias, to request
- *          that only a DISCONNECTED alias be returned.
   *
   * If inode has a hashed alias, or is a directory and has any alias,
   * acquire the reference to alias and return it. Otherwise return NULL.
@@ -693,10 +691,9 @@ EXPORT_SYMBOL(dget_parent);
   * of a filesystem.
   *
   * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer
- * any other hashed alias over that one unless @want_discon is set,
- * in which case only return an IS_ROOT, DCACHE_DISCONNECTED alias.
+ * any other hashed alias over that.
   */
-static struct dentry *__d_find_alias(struct inode *inode, int want_discon)
+static struct dentry *__d_find_alias(struct inode *inode)
  {
         struct dentry *alias, *discon_alias;
  
@@ -708,7 +705,7 @@ again:
                         if (IS_ROOT(alias) &&
                             (alias->d_flags & DCACHE_DISCONNECTED)) {
                                 discon_alias = alias;
-                       } else if (!want_discon) {
+                       } else {
                                 __dget_dlock(alias);
                                 spin_unlock(&alias->d_lock);
                                 return alias;
@@ -739,7 +736,7 @@ struct dentry *d_find_alias(struct inode *inode)
  
         if (!list_empty(&inode->i_dentry)) {
                 spin_lock(&inode->i_lock);
-               de = __d_find_alias(inode, 0);
+               de = __d_find_alias(inode);
                 spin_unlock(&inode->i_lock);
         }
         return de;
@@ -1650,9 +1647,8 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
  
         if (inode && S_ISDIR(inode->i_mode)) {
                 spin_lock(&inode->i_lock);
-               new = __d_find_alias(inode, 1);
+               new = __d_find_any_alias(inode);
                 if (new) {
-                       BUG_ON(!(new->d_flags & DCACHE_DISCONNECTED));
                         spin_unlock(&inode->i_lock);
                         security_d_instantiate(new, inode);
                         d_move(new, dentry);
@@ -2482,7 +2478,7 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
                 struct dentry *alias;
  
                 /* Does an aliased dentry already exist? */
-               alias = __d_find_alias(inode, 0);
+               alias = __d_find_alias(inode);
                 if (alias) {
                         actual = alias;
                         write_seqlock(&rename_lock);
@@ -2575,7 +2571,7 @@ static int prepend_path(const struct path *path,
         bool slash = false;
         int error = 0;
  
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         while (dentry != root->dentry || vfsmnt != root->mnt) {
                 struct dentry * parent;
  
@@ -2606,7 +2602,7 @@ static int prepend_path(const struct path *path,
                 error = prepend(buffer, buflen, "/", 1);
  
  out:
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
         return error;
  
  global_root:
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c

index ab35b113003b900ad3592217d64e6cef82fe8f9d..a07441a0a8789a9ee1e43f5be0d2b43ec3ee04e8 100644 (file)
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -660,11 +660,10 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
  {
         struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
         char *lower_buf;
-       size_t lower_bufsiz = PATH_MAX;
         mm_segment_t old_fs;
         int rc;
  
-       lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL);
+       lower_buf = kmalloc(PATH_MAX, GFP_KERNEL);
         if (!lower_buf) {
                 rc = -ENOMEM;
                 goto out;
@@ -673,58 +672,29 @@ static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf,
         set_fs(get_ds());
         rc = lower_dentry->d_inode->i_op->readlink(lower_dentry,
                                                    (char __user *)lower_buf,
-                                                  lower_bufsiz);
+                                                  PATH_MAX);
         set_fs(old_fs);
         if (rc < 0)
                 goto out;
-       lower_bufsiz = rc;
         rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry,
-                                                 lower_buf, lower_bufsiz);
+                                                 lower_buf, rc);
  out:
         kfree(lower_buf);
         return rc;
  }
  
-static int
-ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
  {
-       char *kbuf;
-       size_t kbufsiz, copied;
+       char *buf;
+       size_t len = PATH_MAX;
         int rc;
  
-       rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz);
+       rc = ecryptfs_readlink_lower(dentry, &buf, &len);
         if (rc)
                 goto out;
-       copied = min_t(size_t, bufsiz, kbufsiz);
-       rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied;
-       kfree(kbuf);
         fsstack_copy_attr_atime(dentry->d_inode,
                                 ecryptfs_dentry_to_lower(dentry)->d_inode);
-out:
-       return rc;
-}
-
-static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
-{
-       char *buf;
-       int len = PAGE_SIZE, rc;
-       mm_segment_t old_fs;
-
-       /* Released in ecryptfs_put_link(); only release here on error */
-       buf = kmalloc(len, GFP_KERNEL);
-       if (!buf) {
-               buf = ERR_PTR(-ENOMEM);
-               goto out;
-       }
-       old_fs = get_fs();
-       set_fs(get_ds());
-       rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
-       set_fs(old_fs);
-       if (rc < 0) {
-               kfree(buf);
-               buf = ERR_PTR(rc);
-       } else
-               buf[rc] = '\0';
+       buf[len] = '\0';
  out:
         nd_set_link(nd, buf);
         return NULL;
@@ -1153,7 +1123,7 @@ out:
  }
  
  const struct inode_operations ecryptfs_symlink_iops = {
-       .readlink = ecryptfs_readlink,
+       .readlink = generic_readlink,
         .follow_link = ecryptfs_follow_link,
         .put_link = ecryptfs_put_link,
         .permission = ecryptfs_permission,
diff --git a/fs/exec.c b/fs/exec.c

index 52c9e2ff6e6bd8b6f763e56ceafda431731cea9b..a79786a8d2c88d5b6c580859ef12496f43b4b0f4 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -280,10 +280,6 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
         INIT_LIST_HEAD(&vma->anon_vma_chain);
  
-       err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
-       if (err)
-               goto err;
-
         err = insert_vm_struct(mm, vma);
         if (err)
                 goto err;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c

index b05acb7961355dfb680e49f3145a11065f6ac851..b0201ca6e9c6e0b7837917420bb3dfe1dc06b88f 100644 (file)
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -304,24 +304,23 @@ out:
  
  /**
   * export_encode_fh - default export_operations->encode_fh function
- * @dentry:  the dentry to encode
+ * @inode:   the object to encode
   * @fh:      where to store the file handle fragment
   * @max_len: maximum length to store there
- * @connectable: whether to store parent information
+ * @parent:  parent directory inode, if wanted
   *
   * This default encode_fh function assumes that the 32 inode number
   * is suitable for locating an inode, and that the generation number
   * can be used to check that it is still valid.  It places them in the
   * filehandle fragment where export_decode_fh expects to find them.
   */
-static int export_encode_fh(struct dentry *dentry, struct fid *fid,
-               int *max_len, int connectable)
+static int export_encode_fh(struct inode *inode, struct fid *fid,
+               int *max_len, struct inode *parent)
  {
-       struct inode * inode = dentry->d_inode;
         int len = *max_len;
         int type = FILEID_INO32_GEN;
  
-       if (connectable && (len < 4)) {
+       if (parent && (len < 4)) {
                 *max_len = 4;
                 return 255;
         } else if (len < 2) {
@@ -332,14 +331,9 @@ static int export_encode_fh(struct dentry *dentry, struct fid *fid,
         len = 2;
         fid->i32.ino = inode->i_ino;
         fid->i32.gen = inode->i_generation;
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                 fid->i32.parent_ino = parent->i_ino;
                 fid->i32.parent_gen = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
                 len = 4;
                 type = FILEID_INO32_GEN_PARENT;
         }
@@ -352,11 +346,22 @@ int exportfs_encode_fh(struct dentry *dentry, struct fid *fid, int *max_len,
  {
         const struct export_operations *nop = dentry->d_sb->s_export_op;
         int error;
+       struct dentry *p = NULL;
+       struct inode *inode = dentry->d_inode, *parent = NULL;
  
+       if (connectable && !S_ISDIR(inode->i_mode)) {
+               p = dget_parent(dentry);
+               /*
+                * note that while p might've ceased to be our parent already,
+                * it's still pinned by our reference and still positive.
+                */
+               parent = p->d_inode;
+       }
         if (nop->encode_fh)
-               error = nop->encode_fh(dentry, fid->raw, max_len, connectable);
+               error = nop->encode_fh(inode, fid->raw, max_len, parent);
         else
-               error = export_encode_fh(dentry, fid, max_len, connectable);
+               error = export_encode_fh(inode, fid, max_len, parent);
+       dput(p);
  
         return error;
  }
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig

index 9ed1bb1f319f381b700a6386a4d8d068d04e0fdf..c22f17021b6eee7ca942a3525eb9f4fd23de6011 100644 (file)
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -2,6 +2,8 @@ config EXT4_FS
         tristate "The Extended 4 (ext4) filesystem"
         select JBD2
         select CRC16
+       select CRYPTO
+       select CRYPTO_CRC32C
         help
           This is the next generation of the ext3 filesystem.
  
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c

index c45c41129a35b7346463e0f18e847b78a52e0426..99b6324290db916466d8b5c0633e9fa216d21798 100644 (file)
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -168,12 +168,14 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
  
         /* If checksum is bad mark all blocks used to prevent allocation
          * essentially implementing a per-group read-only flag. */
-       if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+       if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                 ext4_error(sb, "Checksum bad for group %u", block_group);
                 ext4_free_group_clusters_set(sb, gdp, 0);
                 ext4_free_inodes_set(sb, gdp, 0);
                 ext4_itable_unused_set(sb, gdp, 0);
                 memset(bh->b_data, 0xff, sb->s_blocksize);
+               ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+                                          EXT4_BLOCKS_PER_GROUP(sb) / 8);
                 return;
         }
         memset(bh->b_data, 0, sb->s_blocksize);
@@ -210,6 +212,9 @@ void ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
          */
         ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group),
                              sb->s_blocksize * 8, bh->b_data);
+       ext4_block_bitmap_csum_set(sb, block_group, gdp, bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
  }
  
  /* Return the number of free blocks in a block group.  It is used when
@@ -276,9 +281,9 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
  }
  
  static int ext4_valid_block_bitmap(struct super_block *sb,
-                                       struct ext4_group_desc *desc,
-                                       unsigned int block_group,
-                                       struct buffer_head *bh)
+                                  struct ext4_group_desc *desc,
+                                  unsigned int block_group,
+                                  struct buffer_head *bh)
  {
         ext4_grpblk_t offset;
         ext4_grpblk_t next_zero_bit;
@@ -325,6 +330,23 @@ err_out:
                         block_group, bitmap_blk);
         return 0;
  }
+
+void ext4_validate_block_bitmap(struct super_block *sb,
+                              struct ext4_group_desc *desc,
+                              unsigned int block_group,
+                              struct buffer_head *bh)
+{
+       if (buffer_verified(bh))
+               return;
+
+       ext4_lock_group(sb, block_group);
+       if (ext4_valid_block_bitmap(sb, desc, block_group, bh) &&
+           ext4_block_bitmap_csum_verify(sb, block_group, desc, bh,
+                                         EXT4_BLOCKS_PER_GROUP(sb) / 8))
+               set_buffer_verified(bh);
+       ext4_unlock_group(sb, block_group);
+}
+
  /**
   * ext4_read_block_bitmap()
   * @sb:                        super block
@@ -355,12 +377,12 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
         }
  
         if (bitmap_uptodate(bh))
-               return bh;
+               goto verify;
  
         lock_buffer(bh);
         if (bitmap_uptodate(bh)) {
                 unlock_buffer(bh);
-               return bh;
+               goto verify;
         }
         ext4_lock_group(sb, block_group);
         if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -379,7 +401,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
                  */
                 set_bitmap_uptodate(bh);
                 unlock_buffer(bh);
-               return bh;
+               goto verify;
         }
         /*
          * submit the buffer_head for reading
@@ -390,6 +412,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
         get_bh(bh);
         submit_bh(READ, bh);
         return bh;
+verify:
+       ext4_validate_block_bitmap(sb, desc, block_group, bh);
+       return bh;
  }
  
  /* Returns 0 on success, 1 on error */
@@ -412,7 +437,7 @@ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
         }
         clear_buffer_new(bh);
         /* Panic or remount fs read-only if block bitmap is invalid */
-       ext4_valid_block_bitmap(sb, desc, block_group, bh);
+       ext4_validate_block_bitmap(sb, desc, block_group, bh);
         return 0;
  }
  
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c

index fa3af81ac565c16dba6237edc89c3c7d70c5fc61..b319721da26ae32010adcd46db7e2d98ec50887a 100644 (file)
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -29,3 +29,86 @@ unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars)
  
  #endif  /*  EXT4FS_DEBUG  */
  
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz)
+{
+       __u32 hi;
+       __u32 provided, calculated;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
+       calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
+               hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
+               provided |= (hi << 16);
+       } else
+               calculated &= 0xFFFF;
+
+       return provided == calculated;
+}
+
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz)
+{
+       __u32 csum;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+       if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
+               gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
+
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz)
+{
+       __u32 hi;
+       __u32 provided, calculated;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
+       calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
+               hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
+               provided |= (hi << 16);
+       } else
+               calculated &= 0xFFFF;
+
+       if (provided == calculated)
+               return 1;
+
+       ext4_error(sb, "Bad block bitmap checksum: block_group = %u", group);
+       return 0;
+}
+
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz)
+{
+       __u32 csum;
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+       gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
+       if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
+               gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
+}
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c

index b86786202643bdd8044ee85fb72a0a21bc2c9bef..aa39e600d15954244aead38f7aed30513ce86d65 100644 (file)
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -179,6 +179,18 @@ static int ext4_readdir(struct file *filp,
                         continue;
                 }
  
+               /* Check the checksum */
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(inode,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+                                       "at offset %llu",
+                                       (unsigned long long)filp->f_pos);
+                       filp->f_pos += sb->s_blocksize - offset;
+                       continue;
+               }
+               set_buffer_verified(bh);
+
  revalidate:
                 /* If the dir block has changed since the last call to
                  * readdir(2), then we might be pointing to an invalid
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h

index c21b1de51afbb42191adea4fc4a357e3906c8489..cfc4e01b3c8370c642681824ef55b13a66683c0d 100644 (file)
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,7 @@
  #include <linux/wait.h>
  #include <linux/blockgroup_lock.h>
  #include <linux/percpu_counter.h>
+#include <crypto/hash.h>
  #ifdef __KERNEL__
  #include <linux/compat.h>
  #endif
@@ -298,7 +299,9 @@ struct ext4_group_desc
         __le16  bg_free_inodes_count_lo;/* Free inodes count */
         __le16  bg_used_dirs_count_lo;  /* Directories count */
         __le16  bg_flags;               /* EXT4_BG_flags (INODE_UNINIT, etc) */
-       __u32   bg_reserved[2];         /* Likely block/inode bitmap checksum */
+       __le32  bg_exclude_bitmap_lo;   /* Exclude bitmap for snapshots */
+       __le16  bg_block_bitmap_csum_lo;/* crc32c(s_uuid+bbitmap) LSB */
+       __le16  bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+ibitmap) LSB */
         __le16  bg_itable_unused_lo;    /* Unused inodes count */
         __le16  bg_checksum;            /* crc16(sb_uuid+group+desc) */
         __le32  bg_block_bitmap_hi;     /* Blocks bitmap block MSB */
@@ -308,9 +311,19 @@ struct ext4_group_desc
         __le16  bg_free_inodes_count_hi;/* Free inodes count MSB */
         __le16  bg_used_dirs_count_hi;  /* Directories count MSB */
         __le16  bg_itable_unused_hi;    /* Unused inodes count MSB */
-       __u32   bg_reserved2[3];
+       __le32  bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
+       __le16  bg_block_bitmap_csum_hi;/* crc32c(s_uuid+bbitmap) MSB */
+       __le16  bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+ibitmap) MSB */
+       __u32   bg_reserved;
  };
  
+#define EXT4_BG_INODE_BITMAP_CSUM_HI_END       \
+       (offsetof(struct ext4_group_desc, bg_inode_bitmap_csum_hi) + \
+        sizeof(__le16))
+#define EXT4_BG_BLOCK_BITMAP_CSUM_HI_END       \
+       (offsetof(struct ext4_group_desc, bg_block_bitmap_csum_hi) + \
+        sizeof(__le16))
+
  /*
   * Structure of a flex block group info
   */
@@ -650,7 +663,8 @@ struct ext4_inode {
                         __le16  l_i_file_acl_high;
                         __le16  l_i_uid_high;   /* these 2 fields */
                         __le16  l_i_gid_high;   /* were reserved2[0] */
-                       __u32   l_i_reserved2;
+                       __le16  l_i_checksum_lo;/* crc32c(uuid+inum+inode) LE */
+                       __le16  l_i_reserved;
                 } linux2;
                 struct {
                         __le16  h_i_reserved1;  /* Obsoleted fragment number/size which are removed in ext4 */
@@ -666,7 +680,7 @@ struct ext4_inode {
                 } masix2;
         } osd2;                         /* OS dependent 2 */
         __le16  i_extra_isize;
-       __le16  i_pad1;
+       __le16  i_checksum_hi;  /* crc32c(uuid+inum+inode) MSB */
         __le32  i_ctime_extra;  /* extra Change time      (nsec << 2 | epoch) */
         __le32  i_mtime_extra;  /* extra Modification time(nsec << 2 | epoch) */
         __le32  i_atime_extra;  /* extra Access time      (nsec << 2 | epoch) */
@@ -768,7 +782,7 @@ do {                                                                               \
  #define i_gid_low      i_gid
  #define i_uid_high     osd2.linux2.l_i_uid_high
  #define i_gid_high     osd2.linux2.l_i_gid_high
-#define i_reserved2    osd2.linux2.l_i_reserved2
+#define i_checksum_lo  osd2.linux2.l_i_checksum_lo
  
  #elif defined(__GNU__)
  
@@ -908,6 +922,9 @@ struct ext4_inode_info {
          */
         tid_t i_sync_tid;
         tid_t i_datasync_tid;
+
+       /* Precomputed uuid+inum+igen checksum for seeding inode checksums */
+       __u32 i_csum_seed;
  };
  
  /*
@@ -1001,6 +1018,9 @@ extern void ext4_set_bits(void *bm, int cur, int len);
  #define EXT4_ERRORS_PANIC              3       /* Panic */
  #define EXT4_ERRORS_DEFAULT            EXT4_ERRORS_CONTINUE
  
+/* Metadata checksum algorithm codes */
+#define EXT4_CRC32C_CHKSUM             1
+
  /*
   * Structure of the super block
   */
@@ -1087,7 +1107,7 @@ struct ext4_super_block {
         __le64  s_mmp_block;            /* Block for multi-mount protection */
         __le32  s_raid_stripe_width;    /* blocks on all data disks (N*stride)*/
         __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
-       __u8    s_reserved_char_pad;
+       __u8    s_checksum_type;        /* metadata checksum algorithm used */
         __le16  s_reserved_pad;
         __le64  s_kbytes_written;       /* nr of lifetime kilobytes written */
         __le32  s_snapshot_inum;        /* Inode number of active snapshot */
@@ -1113,7 +1133,8 @@ struct ext4_super_block {
         __le32  s_usr_quota_inum;       /* inode for tracking user quota */
         __le32  s_grp_quota_inum;       /* inode for tracking group quota */
         __le32  s_overhead_clusters;    /* overhead blocks/clusters in fs */
-       __le32  s_reserved[109];        /* Padding to the end of the block */
+       __le32  s_reserved[108];        /* Padding to the end of the block */
+       __le32  s_checksum;             /* crc32c(superblock) */
  };
  
  #define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START)
@@ -1176,6 +1197,7 @@ struct ext4_sb_info {
         struct proc_dir_entry *s_proc;
         struct kobject s_kobj;
         struct completion s_kobj_unregister;
+       struct super_block *s_sb;
  
         /* Journaling */
         struct journal_s *s_journal;
@@ -1266,6 +1288,12 @@ struct ext4_sb_info {
  
         /* record the last minlen when FITRIM is called. */
         atomic_t s_last_trim_minblks;
+
+       /* Reference to checksum algorithm driver via cryptoapi */
+       struct crypto_shash *s_chksum_driver;
+
+       /* Precomputed FS UUID checksum for seeding other checksums */
+       __u32 s_csum_seed;
  };
  
  static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1414,6 +1442,12 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
  #define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE     0x0040
  #define EXT4_FEATURE_RO_COMPAT_QUOTA           0x0100
  #define EXT4_FEATURE_RO_COMPAT_BIGALLOC                0x0200
+/*
+ * METADATA_CSUM also enables group descriptor checksums (GDT_CSUM).  When
+ * METADATA_CSUM is set, group descriptor checksums use the same algorithm as
+ * all other data structures' checksums.  However, the METADATA_CSUM and
+ * GDT_CSUM bits are mutually exclusive.
+ */
  #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM   0x0400
  
  #define EXT4_FEATURE_INCOMPAT_COMPRESSION      0x0001
@@ -1461,7 +1495,8 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei)
                                          EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \
                                          EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\
                                          EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
-                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC)
+                                        EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
+                                        EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)
  
  /*
   * Default values for user and/or group using reserved blocks
@@ -1526,6 +1561,18 @@ struct ext4_dir_entry_2 {
         char    name[EXT4_NAME_LEN];    /* File name */
  };
  
+/*
+ * This is a bogus directory entry at the end of each leaf block that
+ * records checksums.
+ */
+struct ext4_dir_entry_tail {
+       __le32  det_reserved_zero1;     /* Pretend to be unused */
+       __le16  det_rec_len;            /* 12 */
+       __u8    det_reserved_zero2;     /* Zero name length */
+       __u8    det_reserved_ft;        /* 0xDE, fake file type */
+       __le32  det_checksum;           /* crc32c(uuid+inum+dirblock) */
+};
+
  /*
   * Ext4 directory file types.  Only the low 3 bits are used.  The
   * other bits are reserved for now.
@@ -1541,6 +1588,8 @@ struct ext4_dir_entry_2 {
  
  #define EXT4_FT_MAX            8
  
+#define EXT4_FT_DIR_CSUM       0xDE
+
  /*
   * EXT4_DIR_PAD defines the directory entries boundaries
   *
@@ -1609,6 +1658,25 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
  #define DX_HASH_HALF_MD4_UNSIGNED      4
  #define DX_HASH_TEA_UNSIGNED           5
  
+static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
+                             const void *address, unsigned int length)
+{
+       struct {
+               struct shash_desc shash;
+               char ctx[crypto_shash_descsize(sbi->s_chksum_driver)];
+       } desc;
+       int err;
+
+       desc.shash.tfm = sbi->s_chksum_driver;
+       desc.shash.flags = 0;
+       *(u32 *)desc.ctx = crc;
+
+       err = crypto_shash_update(&desc.shash, address, length);
+       BUG_ON(err);
+
+       return *(u32 *)desc.ctx;
+}
+
  #ifdef __KERNEL__
  
  /* hash info structure used by the directory hash */
@@ -1741,7 +1809,8 @@ struct mmp_struct {
         __le16  mmp_check_interval;
  
         __le16  mmp_pad1;
-       __le32  mmp_pad2[227];
+       __le32  mmp_pad2[226];
+       __le32  mmp_checksum;           /* crc32c(uuid+mmp_block) */
  };
  
  /* arguments passed to the mmp thread */
@@ -1784,8 +1853,24 @@ struct mmpd_data {
  
  /* bitmap.c */
  extern unsigned int ext4_count_free(struct buffer_head *, unsigned);
+void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz);
+int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz);
+void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group,
+                               struct ext4_group_desc *gdp,
+                               struct buffer_head *bh, int sz);
+int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group,
+                                 struct ext4_group_desc *gdp,
+                                 struct buffer_head *bh, int sz);
  
  /* balloc.c */
+extern void ext4_validate_block_bitmap(struct super_block *sb,
+                                      struct ext4_group_desc *desc,
+                                      unsigned int block_group,
+                                      struct buffer_head *bh);
  extern unsigned int ext4_block_group(struct super_block *sb,
                         ext4_fsblk_t blocknr);
  extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
@@ -1864,7 +1949,7 @@ extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
  /* mballoc.c */
  extern long ext4_mb_stats;
  extern long ext4_mb_max_to_scan;
-extern int ext4_mb_init(struct super_block *, int);
+extern int ext4_mb_init(struct super_block *);
  extern int ext4_mb_release(struct super_block *);
  extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
                                 struct ext4_allocation_request *, int *);
@@ -1936,6 +2021,8 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
  extern int ext4_ext_migrate(struct inode *);
  
  /* namei.c */
+extern int ext4_dirent_csum_verify(struct inode *inode,
+                                  struct ext4_dir_entry *dirent);
  extern int ext4_orphan_add(handle_t *, struct inode *);
  extern int ext4_orphan_del(handle_t *, struct inode *);
  extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
@@ -1950,6 +2037,10 @@ extern int ext4_group_extend(struct super_block *sb,
  extern int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count);
  
  /* super.c */
+extern int ext4_superblock_csum_verify(struct super_block *sb,
+                                      struct ext4_super_block *es);
+extern void ext4_superblock_csum_set(struct super_block *sb,
+                                    struct ext4_super_block *es);
  extern void *ext4_kvmalloc(size_t size, gfp_t flags);
  extern void *ext4_kvzalloc(size_t size, gfp_t flags);
  extern void ext4_kvfree(void *ptr);
@@ -2025,10 +2116,17 @@ extern void ext4_used_dirs_set(struct super_block *sb,
                                 struct ext4_group_desc *bg, __u32 count);
  extern void ext4_itable_unused_set(struct super_block *sb,
                                    struct ext4_group_desc *bg, __u32 count);
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-                                  struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+extern int ext4_group_desc_csum_verify(struct super_block *sb, __u32 group,
                                        struct ext4_group_desc *gdp);
+extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group,
+                                    struct ext4_group_desc *gdp);
+
+static inline int ext4_has_group_desc_csum(struct super_block *sb)
+{
+       return EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                         EXT4_FEATURE_RO_COMPAT_GDT_CSUM |
+                                         EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+}
  
  static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
  {
@@ -2225,6 +2323,9 @@ static inline void ext4_unlock_group(struct super_block *sb,
  
  static inline void ext4_mark_super_dirty(struct super_block *sb)
  {
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+       ext4_superblock_csum_set(sb, es);
         if (EXT4_SB(sb)->s_journal == NULL)
                 sb->s_dirt =1;
  }
@@ -2314,6 +2415,9 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
  
  /* mmp.c */
  extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
+extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
+extern int ext4_mmp_csum_verify(struct super_block *sb,
+                               struct mmp_struct *mmp);
  
  /* BH_Uninit flag: blocks are allocated but uninitialized on disk */
  enum ext4_state_bits {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h

index 0f58b86e3a0206e19626f361f453aa80b2838857..cb1b2c919963290fd10d09ba12f6d8c53ace9fa6 100644 (file)
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -63,8 +63,21 @@
   * ext4_inode has i_block array (60 bytes total).
   * The first 12 bytes store ext4_extent_header;
   * the remainder stores an array of ext4_extent.
+ * For non-inode extent blocks, ext4_extent_tail
+ * follows the array.
   */
  
+/*
+ * This is the extent tail on-disk structure.
+ * All other extent structures are 12 bytes long.  It turns out that
+ * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which
+ * covers all valid ext4 block sizes.  Therefore, this tail structure can be
+ * crammed into the end of the block without having to rebalance the tree.
+ */
+struct ext4_extent_tail {
+       __le32  et_checksum;    /* crc32c(uuid+inum+extent_block) */
+};
+
  /*
   * This is the extent on-disk structure.
   * It's used at the bottom of the tree.
@@ -101,6 +114,17 @@ struct ext4_extent_header {
  
  #define EXT4_EXT_MAGIC         cpu_to_le16(0xf30a)
  
+#define EXT4_EXTENT_TAIL_OFFSET(hdr) \
+       (sizeof(struct ext4_extent_header) + \
+        (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max)))
+
+static inline struct ext4_extent_tail *
+find_ext4_extent_tail(struct ext4_extent_header *eh)
+{
+       return (struct ext4_extent_tail *)(((void *)eh) +
+                                          EXT4_EXTENT_TAIL_OFFSET(eh));
+}
+
  /*
   * Array of ext4_ext_path contains path to some extent.
   * Creation/lookup routines use it for traversal/splitting/etc.
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c

index aca17901758249d4329d780714e328ef42851e35..90f7c2e84db1bef3fdb90931f9124cfe052705dd 100644 (file)
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -138,16 +138,23 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
  }
  
  int __ext4_handle_dirty_super(const char *where, unsigned int line,
-                             handle_t *handle, struct super_block *sb)
+                             handle_t *handle, struct super_block *sb,
+                             int now)
  {
         struct buffer_head *bh = EXT4_SB(sb)->s_sbh;
         int err = 0;
  
         if (ext4_handle_valid(handle)) {
+               ext4_superblock_csum_set(sb,
+                               (struct ext4_super_block *)bh->b_data);
                 err = jbd2_journal_dirty_metadata(handle, bh);
                 if (err)
                         ext4_journal_abort_handle(where, line, __func__,
                                                   bh, handle, err);
+       } else if (now) {
+               ext4_superblock_csum_set(sb,
+                               (struct ext4_super_block *)bh->b_data);
+               mark_buffer_dirty(bh);
         } else
                 sb->s_dirt = 1;
         return err;
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h

index 83b20fcf9400b11b28185470f8feef309faa8252..f440e8f1841f4e2521486bd94ae19ed83aa896ab 100644 (file)
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -213,7 +213,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line,
                                  struct buffer_head *bh);
  
  int __ext4_handle_dirty_super(const char *where, unsigned int line,
-                             handle_t *handle, struct super_block *sb);
+                             handle_t *handle, struct super_block *sb,
+                             int now);
  
  #define ext4_journal_get_write_access(handle, bh) \
         __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh))
@@ -225,8 +226,10 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
  #define ext4_handle_dirty_metadata(handle, inode, bh) \
         __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \
                                      (bh))
+#define ext4_handle_dirty_super_now(handle, sb) \
+       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 1)
  #define ext4_handle_dirty_super(handle, sb) \
-       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
+       __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb), 0)
  
  handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
  int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c

index abcdeab67f5232b66d4aa5a6cbb88838094f6247..91341ec6e06a94f2f400d10039a64585cd17ed2e 100644 (file)
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -52,6 +52,46 @@
  #define EXT4_EXT_MARK_UNINIT1  0x2  /* mark first half uninitialized */
  #define EXT4_EXT_MARK_UNINIT2  0x4  /* mark second half uninitialized */
  
+static __le32 ext4_extent_block_csum(struct inode *inode,
+                                    struct ext4_extent_header *eh)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       __u32 csum;
+
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+                          EXT4_EXTENT_TAIL_OFFSET(eh));
+       return cpu_to_le32(csum);
+}
+
+static int ext4_extent_block_csum_verify(struct inode *inode,
+                                        struct ext4_extent_header *eh)
+{
+       struct ext4_extent_tail *et;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       et = find_ext4_extent_tail(eh);
+       if (et->et_checksum != ext4_extent_block_csum(inode, eh))
+               return 0;
+       return 1;
+}
+
+static void ext4_extent_block_csum_set(struct inode *inode,
+                                      struct ext4_extent_header *eh)
+{
+       struct ext4_extent_tail *et;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       et = find_ext4_extent_tail(eh);
+       et->et_checksum = ext4_extent_block_csum(inode, eh);
+}
+
  static int ext4_split_extent(handle_t *handle,
                                 struct inode *inode,
                                 struct ext4_ext_path *path,
@@ -117,6 +157,7 @@ static int __ext4_ext_dirty(const char *where, unsigned int line,
  {
         int err;
         if (path->p_bh) {
+               ext4_extent_block_csum_set(inode, ext_block_hdr(path->p_bh));
                 /* path points to block */
                 err = __ext4_handle_dirty_metadata(where, line, handle,
                                                    inode, path->p_bh);
@@ -391,6 +432,12 @@ static int __ext4_ext_check(const char *function, unsigned int line,
                 error_msg = "invalid extent entries";
                 goto corrupted;
         }
+       /* Verify checksum on non-root extent tree nodes */
+       if (ext_depth(inode) != depth &&
+           !ext4_extent_block_csum_verify(inode, eh)) {
+               error_msg = "extent tree corrupted";
+               goto corrupted;
+       }
         return 0;
  
  corrupted:
@@ -412,6 +459,26 @@ int ext4_ext_check_inode(struct inode *inode)
         return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
  }
  
+static int __ext4_ext_check_block(const char *function, unsigned int line,
+                                 struct inode *inode,
+                                 struct ext4_extent_header *eh,
+                                 int depth,
+                                 struct buffer_head *bh)
+{
+       int ret;
+
+       if (buffer_verified(bh))
+               return 0;
+       ret = ext4_ext_check(inode, eh, depth);
+       if (ret)
+               return ret;
+       set_buffer_verified(bh);
+       return ret;
+}
+
+#define ext4_ext_check_block(inode, eh, depth, bh)     \
+       __ext4_ext_check_block(__func__, __LINE__, inode, eh, depth, bh)
+
  #ifdef EXT_DEBUG
  static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
  {
@@ -536,7 +603,7 @@ ext4_ext_binsearch_idx(struct inode *inode,
         }
  
         path->p_idx = l - 1;
-       ext_debug("  -> %d->%lld ", le32_to_cpu(path->p_idx->ei_block),
+       ext_debug("  -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block),
                   ext4_idx_pblock(path->p_idx));
  
  #ifdef CHECK_BINSEARCH
@@ -668,8 +735,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
         i = depth;
         /* walk through the tree */
         while (i) {
-               int need_to_validate = 0;
-
                 ext_debug("depth %d: num %d, max %d\n",
                           ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
  
@@ -688,8 +753,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                                 put_bh(bh);
                                 goto err;
                         }
-                       /* validate the extent entries */
-                       need_to_validate = 1;
                 }
                 eh = ext_block_hdr(bh);
                 ppos++;
@@ -703,7 +766,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                 path[ppos].p_hdr = eh;
                 i--;
  
-               if (need_to_validate && ext4_ext_check(inode, eh, i))
+               if (ext4_ext_check_block(inode, eh, i, bh))
                         goto err;
         }
  
@@ -914,6 +977,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                 le16_add_cpu(&neh->eh_entries, m);
         }
  
+       ext4_extent_block_csum_set(inode, neh);
         set_buffer_uptodate(bh);
         unlock_buffer(bh);
  
@@ -992,6 +1056,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode,
                                 sizeof(struct ext4_extent_idx) * m);
                         le16_add_cpu(&neh->eh_entries, m);
                 }
+               ext4_extent_block_csum_set(inode, neh);
                 set_buffer_uptodate(bh);
                 unlock_buffer(bh);
  
@@ -1089,6 +1154,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode,
         else
                 neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0));
         neh->eh_magic = EXT4_EXT_MAGIC;
+       ext4_extent_block_csum_set(inode, neh);
         set_buffer_uptodate(bh);
         unlock_buffer(bh);
  
@@ -1344,7 +1410,8 @@ got_index:
                         return -EIO;
                 eh = ext_block_hdr(bh);
                 /* subtract from p_depth to get proper eh_depth */
-               if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+               if (ext4_ext_check_block(inode, eh,
+                                        path->p_depth - depth, bh)) {
                         put_bh(bh);
                         return -EIO;
                 }
@@ -1357,7 +1424,7 @@ got_index:
         if (bh == NULL)
                 return -EIO;
         eh = ext_block_hdr(bh);
-       if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
+       if (ext4_ext_check_block(inode, eh, path->p_depth - depth, bh)) {
                 put_bh(bh);
                 return -EIO;
         }
@@ -2644,8 +2711,8 @@ cont:
                                 err = -EIO;
                                 break;
                         }
-                       if (ext4_ext_check(inode, ext_block_hdr(bh),
-                                                       depth - i - 1)) {
+                       if (ext4_ext_check_block(inode, ext_block_hdr(bh),
+                                                       depth - i - 1, bh)) {
                                 err = -EIO;
                                 break;
                         }
@@ -4722,8 +4789,8 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length)
  
         /* Now release the pages */
         if (last_page_offset > first_page_offset) {
-               truncate_inode_pages_range(mapping, first_page_offset,
-                                          last_page_offset-1);
+               truncate_pagecache_range(inode, first_page_offset,
+                                        last_page_offset - 1);
         }
  
         /* finish any pending end_io work */
diff --git a/fs/ext4/file.c b/fs/ext4/file.c

index cb70f1812a70f5ca8452e98776cd309ad6638055..8c7642a00054fd1ddf649e733e4b6efb5a0eb14b 100644 (file)
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
  {
         struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
         int unaligned_aio = 0;
-       int ret;
+       ssize_t ret;
  
         /*
          * If we have encountered a bitmap-format file, the size limit
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c

index 9f9acac6c43f4ac8006e363b5567b4a88dd56615..d48e8b14928cf993c50c33fe9b18a90203c2c492 100644 (file)
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -70,24 +70,27 @@ static unsigned ext4_init_inode_bitmap(struct super_block *sb,
                                        ext4_group_t block_group,
                                        struct ext4_group_desc *gdp)
  {
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
-
         J_ASSERT_BH(bh, buffer_locked(bh));
  
         /* If checksum is bad mark all blocks and inodes use to prevent
          * allocation, essentially implementing a per-group read-only flag. */
-       if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) {
+       if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) {
                 ext4_error(sb, "Checksum bad for group %u", block_group);
                 ext4_free_group_clusters_set(sb, gdp, 0);
                 ext4_free_inodes_set(sb, gdp, 0);
                 ext4_itable_unused_set(sb, gdp, 0);
                 memset(bh->b_data, 0xff, sb->s_blocksize);
+               ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8);
                 return 0;
         }
  
         memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8);
         ext4_mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8,
                         bh->b_data);
+       ext4_inode_bitmap_csum_set(sb, block_group, gdp, bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
  
         return EXT4_INODES_PER_GROUP(sb);
  }
@@ -128,12 +131,12 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 return NULL;
         }
         if (bitmap_uptodate(bh))
-               return bh;
+               goto verify;
  
         lock_buffer(bh);
         if (bitmap_uptodate(bh)) {
                 unlock_buffer(bh);
-               return bh;
+               goto verify;
         }
  
         ext4_lock_group(sb, block_group);
@@ -141,6 +144,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                 ext4_init_inode_bitmap(sb, bh, block_group, desc);
                 set_bitmap_uptodate(bh);
                 set_buffer_uptodate(bh);
+               set_buffer_verified(bh);
                 ext4_unlock_group(sb, block_group);
                 unlock_buffer(bh);
                 return bh;
@@ -154,7 +158,7 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                  */
                 set_bitmap_uptodate(bh);
                 unlock_buffer(bh);
-               return bh;
+               goto verify;
         }
         /*
          * submit the buffer_head for reading
@@ -171,6 +175,20 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                            block_group, bitmap_blk);
                 return NULL;
         }
+
+verify:
+       ext4_lock_group(sb, block_group);
+       if (!buffer_verified(bh) &&
+           !ext4_inode_bitmap_csum_verify(sb, block_group, desc, bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8)) {
+               ext4_unlock_group(sb, block_group);
+               put_bh(bh);
+               ext4_error(sb, "Corrupt inode bitmap - block_group = %u, "
+                          "inode_bitmap = %llu", block_group, bitmap_blk);
+               return NULL;
+       }
+       ext4_unlock_group(sb, block_group);
+       set_buffer_verified(bh);
         return bh;
  }
  
@@ -276,7 +294,9 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                 ext4_used_dirs_set(sb, gdp, count);
                 percpu_counter_dec(&sbi->s_dirs_counter);
         }
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+       ext4_inode_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
         ext4_unlock_group(sb, block_group);
  
         percpu_counter_inc(&sbi->s_freeinodes_counter);
@@ -488,10 +508,12 @@ fallback_retry:
         for (i = 0; i < ngroups; i++) {
                 grp = (parent_group + i) % ngroups;
                 desc = ext4_get_group_desc(sb, grp, NULL);
-               grp_free = ext4_free_inodes_count(sb, desc);
-               if (desc && grp_free && grp_free >= avefreei) {
-                       *group = grp;
-                       return 0;
+               if (desc) {
+                       grp_free = ext4_free_inodes_count(sb, desc);
+                       if (grp_free && grp_free >= avefreei) {
+                               *group = grp;
+                               return 0;
+                       }
                 }
         }
  
@@ -709,7 +731,7 @@ repeat_in_this_group:
  
  got:
         /* We may have to initialize the block bitmap if it isn't already */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) &&
+       if (ext4_has_group_desc_csum(sb) &&
             gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                 struct buffer_head *block_bitmap_bh;
  
@@ -731,8 +753,11 @@ got:
                         gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                         ext4_free_group_clusters_set(sb, gdp,
                                 ext4_free_clusters_after_init(sb, group, gdp));
-                       gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
-                                                               gdp);
+                       ext4_block_bitmap_csum_set(sb, group, gdp,
+                                                  block_bitmap_bh,
+                                                  EXT4_BLOCKS_PER_GROUP(sb) /
+                                                  8);
+                       ext4_group_desc_csum_set(sb, group, gdp);
                 }
                 ext4_unlock_group(sb, group);
  
@@ -751,7 +776,7 @@ got:
                 goto fail;
  
         /* Update the relevant bg descriptor fields */
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+       if (ext4_has_group_desc_csum(sb)) {
                 int free;
                 struct ext4_group_info *grp = ext4_get_group_info(sb, group);
  
@@ -772,7 +797,10 @@ got:
                         ext4_itable_unused_set(sb, gdp,
                                         (EXT4_INODES_PER_GROUP(sb) - ino));
                 up_read(&grp->alloc_sem);
+       } else {
+               ext4_lock_group(sb, group);
         }
+
         ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
         if (S_ISDIR(mode)) {
                 ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
@@ -782,10 +810,12 @@ got:
                         atomic_inc(&sbi->s_flex_groups[f].used_dirs);
                 }
         }
-       if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-               ext4_unlock_group(sb, group);
+       if (ext4_has_group_desc_csum(sb)) {
+               ext4_inode_bitmap_csum_set(sb, group, gdp, inode_bitmap_bh,
+                                          EXT4_INODES_PER_GROUP(sb) / 8);
+               ext4_group_desc_csum_set(sb, group, gdp);
         }
+       ext4_unlock_group(sb, group);
  
         BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
         err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
@@ -850,6 +880,19 @@ got:
         inode->i_generation = sbi->s_next_generation++;
         spin_unlock(&sbi->s_next_gen_lock);
  
+       /* Precompute checksum seed for inode metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               __u32 csum;
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+               __le32 inum = cpu_to_le32(inode->i_ino);
+               __le32 gen = cpu_to_le32(inode->i_generation);
+               csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+                                  sizeof(inum));
+               ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+                                             sizeof(gen));
+       }
+
         ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
         ext4_set_inode_state(inode, EXT4_STATE_NEW);
  
@@ -1140,7 +1183,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
  skip_zeroout:
         ext4_lock_group(sb, group);
         gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+       ext4_group_desc_csum_set(sb, group, gdp);
         ext4_unlock_group(sb, group);
  
         BUFFER_TRACE(group_desc_bh,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c

index 07eaf565fdcb2ad4fba4f92c6fe55a01b2fea17b..02bc8cbe7281b3d47c3449a1c4b8e4220685ba52 100644 (file)
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -47,6 +47,73 @@
  
  #define MPAGE_DA_EXTENT_TAIL 0x01
  
+static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
+                             struct ext4_inode_info *ei)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       __u16 csum_lo;
+       __u16 csum_hi = 0;
+       __u32 csum;
+
+       csum_lo = raw->i_checksum_lo;
+       raw->i_checksum_lo = 0;
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
+               csum_hi = raw->i_checksum_hi;
+               raw->i_checksum_hi = 0;
+       }
+
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw,
+                          EXT4_INODE_SIZE(inode->i_sb));
+
+       raw->i_checksum_lo = csum_lo;
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               raw->i_checksum_hi = csum_hi;
+
+       return csum;
+}
+
+static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw,
+                                 struct ext4_inode_info *ei)
+{
+       __u32 provided, calculated;
+
+       if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+           cpu_to_le32(EXT4_OS_LINUX) ||
+           !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       provided = le16_to_cpu(raw->i_checksum_lo);
+       calculated = ext4_inode_csum(inode, raw, ei);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               provided |= ((__u32)le16_to_cpu(raw->i_checksum_hi)) << 16;
+       else
+               calculated &= 0xFFFF;
+
+       return provided == calculated;
+}
+
+static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
+                               struct ext4_inode_info *ei)
+{
+       __u32 csum;
+
+       if (EXT4_SB(inode->i_sb)->s_es->s_creator_os !=
+           cpu_to_le32(EXT4_OS_LINUX) ||
+           !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       csum = ext4_inode_csum(inode, raw, ei);
+       raw->i_checksum_lo = cpu_to_le16(csum & 0xFFFF);
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+           EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi))
+               raw->i_checksum_hi = cpu_to_le16(csum >> 16);
+}
+
  static inline int ext4_begin_ordered_truncate(struct inode *inode,
                                               loff_t new_size)
  {
@@ -3517,8 +3584,7 @@ make_io:
                                 b = table;
                         end = b + EXT4_SB(sb)->s_inode_readahead_blks;
                         num = EXT4_INODES_PER_GROUP(sb);
-                       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                      EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+                       if (ext4_has_group_desc_csum(sb))
                                 num -= ext4_itable_unused_count(sb, gdp);
                         table += num / inodes_per_block;
                         if (end > table)
@@ -3646,6 +3712,39 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         if (ret < 0)
                 goto bad_inode;
         raw_inode = ext4_raw_inode(&iloc);
+
+       if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
+               ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
+               if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
+                   EXT4_INODE_SIZE(inode->i_sb)) {
+                       EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)",
+                               EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize,
+                               EXT4_INODE_SIZE(inode->i_sb));
+                       ret = -EIO;
+                       goto bad_inode;
+               }
+       } else
+               ei->i_extra_isize = 0;
+
+       /* Precompute checksum seed for inode metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+               __u32 csum;
+               __le32 inum = cpu_to_le32(inode->i_ino);
+               __le32 gen = raw_inode->i_generation;
+               csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+                                  sizeof(inum));
+               ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
+                                             sizeof(gen));
+       }
+
+       if (!ext4_inode_csum_verify(inode, raw_inode, ei)) {
+               EXT4_ERROR_INODE(inode, "checksum invalid");
+               ret = -EIO;
+               goto bad_inode;
+       }
+
         inode->i_mode = le16_to_cpu(raw_inode->i_mode);
         i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
         i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
@@ -3725,12 +3824,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
         }
  
         if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
-               ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
-               if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
-                   EXT4_INODE_SIZE(inode->i_sb)) {
-                       ret = -EIO;
-                       goto bad_inode;
-               }
                 if (ei->i_extra_isize == 0) {
                         /* The extra space is currently unused. Use it. */
                         ei->i_extra_isize = sizeof(struct ext4_inode) -
@@ -3742,8 +3835,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                         if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC))
                                 ext4_set_inode_state(inode, EXT4_STATE_XATTR);
                 }
-       } else
-               ei->i_extra_isize = 0;
+       }
  
         EXT4_INODE_GET_XTIME(i_ctime, inode, raw_inode);
         EXT4_INODE_GET_XTIME(i_mtime, inode, raw_inode);
@@ -3942,7 +4034,7 @@ static int ext4_do_update_inode(handle_t *handle,
                         EXT4_SET_RO_COMPAT_FEATURE(sb,
                                         EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
                         ext4_handle_sync(handle);
-                       err = ext4_handle_dirty_super(handle, sb);
+                       err = ext4_handle_dirty_super_now(handle, sb);
                 }
         }
         raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -3969,6 +4061,8 @@ static int ext4_do_update_inode(handle_t *handle,
                 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
         }
  
+       ext4_inode_csum_set(inode, raw_inode, ei);
+
         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
         rc = ext4_handle_dirty_metadata(handle, NULL, bh);
         if (!err)
@@ -4213,7 +4307,8 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
          * will return the blocks that include the delayed allocation
          * blocks for this file.
          */
-       delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks;
+       delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
+                               EXT4_I(inode)->i_reserved_data_blocks);
  
         stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9;
         return 0;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c

index 6eee25591b8159bc96d35a16f94f94c0855a35b9..8ad112ae0ade2f21a953ccc03b687939b0b81310 100644 (file)
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -38,7 +38,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 handle_t *handle = NULL;
                 int err, migrate = 0;
                 struct ext4_iloc iloc;
-               unsigned int oldflags;
+               unsigned int oldflags, mask, i;
                 unsigned int jflag;
  
                 if (!inode_owner_or_capable(inode))
@@ -115,8 +115,14 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 if (err)
                         goto flags_err;
  
-               flags = flags & EXT4_FL_USER_MODIFIABLE;
-               flags |= oldflags & ~EXT4_FL_USER_MODIFIABLE;
+               for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+                       if (!(mask & EXT4_FL_USER_MODIFIABLE))
+                               continue;
+                       if (mask & flags)
+                               ext4_set_inode_flag(inode, i);
+                       else
+                               ext4_clear_inode_flag(inode, i);
+               }
                 ei->i_flags = flags;
  
                 ext4_set_inode_flags(inode);
@@ -152,6 +158,13 @@ flags_out:
                 if (!inode_owner_or_capable(inode))
                         return -EPERM;
  
+               if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+                       ext4_warning(sb, "Setting inode version is not "
+                                    "supported with metadata_csum enabled.");
+                       return -ENOTTY;
+               }
+
                 err = mnt_want_write_file(filp);
                 if (err)
                         return err;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c

index 99ab428bcfa089822e74b433aee7b1bf4076e34d..1cd6994fc446008b74dc9b77863edf0f24e14c33 100644 (file)
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -788,7 +788,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
         int first_block;
         struct super_block *sb;
         struct buffer_head *bhs;
-       struct buffer_head **bh;
+       struct buffer_head **bh = NULL;
         struct inode *inode;
         char *data;
         char *bitmap;
@@ -2375,7 +2375,7 @@ static int ext4_groupinfo_create_slab(size_t size)
         return 0;
  }
  
-int ext4_mb_init(struct super_block *sb, int needs_recovery)
+int ext4_mb_init(struct super_block *sb)
  {
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         unsigned i, j;
@@ -2517,6 +2517,9 @@ int ext4_mb_release(struct super_block *sb)
         struct ext4_sb_info *sbi = EXT4_SB(sb);
         struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
  
+       if (sbi->s_proc)
+               remove_proc_entry("mb_groups", sbi->s_proc);
+
         if (sbi->s_group_info) {
                 for (i = 0; i < ngroups; i++) {
                         grinfo = ext4_get_group_info(sb, i);
@@ -2564,8 +2567,6 @@ int ext4_mb_release(struct super_block *sb)
         }
  
         free_percpu(sbi->s_locality_groups);
-       if (sbi->s_proc)
-               remove_proc_entry("mb_groups", sbi->s_proc);
  
         return 0;
  }
@@ -2797,7 +2798,9 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
         }
         len = ext4_free_group_clusters(sb, gdp) - ac->ac_b_ex.fe_len;
         ext4_free_group_clusters_set(sb, gdp, len);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
+       ext4_block_bitmap_csum_set(sb, ac->ac_b_ex.fe_group, gdp, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, ac->ac_b_ex.fe_group, gdp);
  
         ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
         percpu_counter_sub(&sbi->s_freeclusters_counter, ac->ac_b_ex.fe_len);
@@ -3071,13 +3074,9 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
  static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
  {
         struct ext4_prealloc_space *pa = ac->ac_pa;
-       int len;
-
-       if (pa && pa->pa_type == MB_INODE_PA) {
-               len = ac->ac_b_ex.fe_len;
-               pa->pa_free += len;
-       }
  
+       if (pa && pa->pa_type == MB_INODE_PA)
+               pa->pa_free += ac->ac_b_ex.fe_len;
  }
  
  /*
@@ -4636,6 +4635,7 @@ do_more:
                  */
                 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
                 if (!new_entry) {
+                       ext4_mb_unload_buddy(&e4b);
                         err = -ENOMEM;
                         goto error_return;
                 }
@@ -4659,7 +4659,9 @@ do_more:
  
         ret = ext4_free_group_clusters(sb, gdp) + count_clusters;
         ext4_free_group_clusters_set(sb, gdp, ret);
-       gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
+       ext4_block_bitmap_csum_set(sb, block_group, gdp, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, gdp);
         ext4_unlock_group(sb, block_group);
         percpu_counter_add(&sbi->s_freeclusters_counter, count_clusters);
  
@@ -4803,7 +4805,9 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
         mb_free_blocks(NULL, &e4b, bit, count);
         blk_free_count = blocks_freed + ext4_free_group_clusters(sb, desc);
         ext4_free_group_clusters_set(sb, desc, blk_free_count);
-       desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
+       ext4_block_bitmap_csum_set(sb, block_group, desc, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       ext4_group_desc_csum_set(sb, block_group, desc);
         ext4_unlock_group(sb, block_group);
         percpu_counter_add(&sbi->s_freeclusters_counter,
                            EXT4_B2C(sbi, blocks_freed));
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c

index ed6548d89165e1d9c31118aca21d3e89a3772ab2..f99a1311e84765296b0a0a04534e0be0536915bc 100644 (file)
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -6,12 +6,45 @@
  
  #include "ext4.h"
  
+/* Checksumming functions */
+/* Checksum of the MMP block bytes that precede mmp_checksum, little-endian. */
+static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       int offset = offsetof(struct mmp_struct, mmp_checksum);
+       __u32 csum;
+
+       csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+       return cpu_to_le32(csum);
+}
+
+/* Returns nonzero if the MMP checksum matches or metadata_csum is off. */
+int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp)
+{
+       int csum_on = EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM);
+
+       return !csum_on || mmp->mmp_checksum == ext4_mmp_csum(sb, mmp);
+}
+
+/* Stamp @mmp with its checksum; a no-op unless metadata_csum is enabled. */
+void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp)
+{
+       /* Without metadata_csum the mmp_checksum field is left untouched. */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               mmp->mmp_checksum = ext4_mmp_csum(sb, mmp);
+}
+
  /*
   * Write the MMP block using WRITE_SYNC to try to get the block on-disk
   * faster.
   */
-static int write_mmp_block(struct buffer_head *bh)
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
  {
+       struct mmp_struct *mmp = (struct mmp_struct *)(bh->b_data);
+
+       ext4_mmp_csum_set(sb, mmp);
         mark_buffer_dirty(bh);
         lock_buffer(bh);
         bh->b_end_io = end_buffer_write_sync;
@@ -59,7 +92,8 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh,
         }
  
         mmp = (struct mmp_struct *)((*bh)->b_data);
-       if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC)
+       if (le32_to_cpu(mmp->mmp_magic) != EXT4_MMP_MAGIC ||
+           !ext4_mmp_csum_verify(sb, mmp))
                 return -EINVAL;
  
         return 0;
@@ -120,7 +154,7 @@ static int kmmpd(void *data)
                 mmp->mmp_time = cpu_to_le64(get_seconds());
                 last_update_time = jiffies;
  
-               retval = write_mmp_block(bh);
+               retval = write_mmp_block(sb, bh);
                 /*
                  * Don't spew too many error messages. Print one every
                  * (s_mmp_update_interval * 60) seconds.
@@ -200,7 +234,7 @@ static int kmmpd(void *data)
         mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
         mmp->mmp_time = cpu_to_le64(get_seconds());
  
-       retval = write_mmp_block(bh);
+       retval = write_mmp_block(sb, bh);
  
  failed:
         kfree(data);
@@ -299,7 +333,7 @@ skip:
         seq = mmp_new_seq();
         mmp->mmp_seq = cpu_to_le32(seq);
  
-       retval = write_mmp_block(bh);
+       retval = write_mmp_block(sb, bh);
         if (retval)
                 goto failed;
  
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c

index e2a3f4b0ff78d6f81fbf2228f12f201e6ab1a024..5845cd97bf8b094b0fc01082279e8d65ee73f241 100644 (file)
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -145,6 +145,14 @@ struct dx_map_entry
         u16 size;
  };
  
+/*
+ * Goes at the end of each htree block, right after the last dx_entry slot.
+ */
+struct dx_tail {
+       u32 dt_reserved;        /* included in the checksummed bytes */
+       __le32 dt_checksum;     /* crc32c(uuid+inum+dirblock) */
+};
+
  static inline ext4_lblk_t dx_get_block(struct dx_entry *entry);
  static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value);
  static inline unsigned dx_get_hash(struct dx_entry *entry);
@@ -180,6 +188,230 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
  static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                              struct inode *inode);
  
+/* Checksumming functions.  EXT4_DIRENT_TAIL(): tail slot ending the block. */
+#define EXT4_DIRENT_TAIL(block, blocksize) \
+       ((struct ext4_dir_entry_tail *)(((void *)(block)) + \
+                                       ((blocksize) - \
+                                        sizeof(struct ext4_dir_entry_tail))))
+
+/* Zero @t, then mark it as a checksum tail record of fixed length. */
+static void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
+                                  unsigned int blocksize)
+{
+       memset(t, 0, sizeof(*t));
+       t->det_reserved_ft = EXT4_FT_DIR_CSUM;
+       t->det_rec_len = ext4_rec_len_to_disk(sizeof(*t), blocksize);
+}
+
+/*
+ * Return the checksum tail record of the dirent block starting at @de, or
+ * NULL if the bytes in the tail slot do not look like one (nonzero
+ * reserved fields, wrong rec_len or wrong file-type marker).  When PARANOID
+ * is defined, the rec_len chain from @de must also end exactly at the tail.
+ */
+static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
+                                                  struct ext4_dir_entry *de)
+{
+       struct ext4_dir_entry_tail *t;
+
+       t = EXT4_DIRENT_TAIL(de, EXT4_BLOCK_SIZE(inode->i_sb));
+#ifdef PARANOID
+       {
+               struct ext4_dir_entry *d = de;
+               struct ext4_dir_entry *top = (struct ext4_dir_entry *)t;
+
+               while (d < top && d->rec_len)
+                       d = (struct ext4_dir_entry *)(((void *)d) +
+                           le16_to_cpu(d->rec_len));
+               if (d != top)
+                       return NULL;
+       }
+#endif
+
+       if (t->det_reserved_zero1 ||
+           le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+           t->det_reserved_zero2 ||
+           t->det_reserved_ft != EXT4_FT_DIR_CSUM)
+               return NULL;
+
+       return t;
+}
+
+/* Checksum @size bytes at @dirent, seeded with the inode's i_csum_seed. */
+static __le32 ext4_dirent_csum(struct inode *inode,
+                              struct ext4_dir_entry *dirent, int size)
+{
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       __u32 crc = ext4_chksum(EXT4_SB(inode->i_sb), ei->i_csum_seed,
+                               (__u8 *)dirent, size);
+
+       return cpu_to_le32(crc);
+}
+
+/*
+ * Check the checksum stored in a directory leaf block's tail record.
+ * Returns 1 if it matches (or metadata_csum is off), 0 otherwise.
+ */
+int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+       struct ext4_dir_entry_tail *tail;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       tail = get_dirent_tail(inode, dirent);
+       if (tail == NULL) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+                                "leaf for checksum.  Please run e2fsck -D.");
+               return 0;
+       }
+       return tail->det_checksum ==
+               ext4_dirent_csum(inode, dirent, (void *)tail - (void *)dirent);
+}
+
+/* Recompute and store a leaf block's checksum in its tail record, if any. */
+static void ext4_dirent_csum_set(struct inode *inode,
+                                struct ext4_dir_entry *dirent)
+{
+       struct ext4_dir_entry_tail *tail;
+       int covered;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+       tail = get_dirent_tail(inode, dirent);
+       if (tail == NULL) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space in dir "
+                                "leaf for checksum.  Please run e2fsck -D.");
+               return;
+       }
+       covered = (void *)tail - (void *)dirent;
+       tail->det_checksum = ext4_dirent_csum(inode, dirent, covered);
+}
+
+/* Update the leaf checksum, then pass @bh on as dirty metadata. */
+static inline int ext4_handle_dirty_dirent_node(handle_t *handle,
+               struct inode *inode, struct buffer_head *bh)
+{
+       ext4_dirent_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
+/*
+ * Locate the dx_countlimit: offset 8 when one dirent spans the block,
+ * offset 32 after a 12-byte dirent, a dirent covering the rest and a
+ * dx_root_info; NULL otherwise.  Uses ext4_rec_len_from_disk() on rec_len.
+ */
+static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
+               struct ext4_dir_entry *dirent, int *offset)
+{
+       unsigned int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+       unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
+       struct ext4_dir_entry *dp = (void *)dirent + 12;
+       struct dx_root_info *root = (void *)dp + 12;
+       int count_offset = 8;
+
+       if (rlen == 12) {
+               if (ext4_rec_len_from_disk(dp->rec_len, blocksize) !=
+                   blocksize - 12 || root->reserved_zero ||
+                   root->info_length != sizeof(struct dx_root_info))
+                       return NULL;
+               count_offset = 32;
+       } else if (rlen != blocksize)
+               return NULL;
+
+       if (offset)
+               *offset = count_offset;
+       return (struct dx_countlimit *)(((void *)dirent) + count_offset);
+}
+
+/* Checksum @count dx entries plus the tail, reading dt_checksum as zero. */
+static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
+                          int count_offset, int count, struct dx_tail *t)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       int size = count_offset + (count * sizeof(struct dx_entry));
+       int offset = offsetof(struct dx_tail, dt_checksum);
+       __u32 csum, dummy_csum = 0;
+
+       csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+       csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
+       /* Feed zeros in place of dt_checksum rather than clearing it in @t. */
+       csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+
+       return cpu_to_le32(csum);
+}
+
+/* Returns 1 if the htree block checksum is valid or not in use, else 0. */
+static int ext4_dx_csum_verify(struct inode *inode,
+                              struct ext4_dir_entry *dirent)
+{
+       struct dx_countlimit *c;
+       struct dx_tail *t;
+       int count_offset, limit, count;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       c = get_dx_countlimit(inode, dirent, &count_offset);
+       if (!c) {
+               EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+               return 0;       /* unverifiable is not the same as verified */
+       }
+       limit = le16_to_cpu(c->limit);
+       count = le16_to_cpu(c->count);
+       if (count_offset + (limit * sizeof(struct dx_entry)) >
+           EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+                                "tree checksum found.  Run e2fsck -D.");
+               return 0;
+       }
+       if (count > limit)      /* bound the checksummed span */
+               return 0;
+       t = (struct dx_tail *)(((struct dx_entry *)c) + limit);
+       return t->dt_checksum == ext4_dx_csum(inode, dirent, count_offset,
+                                             count, t);
+}
+
+/* Store a fresh checksum in an htree block's dx_tail, if there is one. */
+static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent)
+{
+       struct dx_countlimit *cl;
+       struct dx_tail *tail;
+       int cl_off, nr_max;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return;
+
+       cl = get_dx_countlimit(inode, dirent, &cl_off);
+       if (cl == NULL) {
+               EXT4_ERROR_INODE(inode, "dir seems corrupt?  Run e2fsck -D.");
+               return;
+       }
+       nr_max = le16_to_cpu(cl->limit);
+       if (cl_off + (nr_max * sizeof(struct dx_entry)) >
+           EXT4_BLOCK_SIZE(inode->i_sb) - sizeof(struct dx_tail)) {
+               EXT4_ERROR_INODE(inode, "metadata_csum set but no space for "
+                                "tree checksum.  Run e2fsck -D.");
+               return;
+       }
+       tail = (struct dx_tail *)(((struct dx_entry *)cl) + nr_max);
+       tail->dt_checksum = ext4_dx_csum(inode, dirent, cl_off,
+                                        le16_to_cpu(cl->count), tail);
+}
+
+/* Update the htree block checksum, then pass @bh on as dirty metadata. */
+static inline int ext4_handle_dirty_dx_node(handle_t *handle,
+               struct inode *inode, struct buffer_head *bh)
+{
+       ext4_dx_csum_set(inode, (struct ext4_dir_entry *)bh->b_data);
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
  /*
   * p is at least 6 bytes before the end of page
   */
@@ -239,12 +471,20 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize)
  {
         unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) -
                 EXT4_DIR_REC_LEN(2) - infosize;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               entry_space -= sizeof(struct dx_tail);
         return entry_space / sizeof(struct dx_entry);
  }
  
  static inline unsigned dx_node_limit(struct inode *dir)
  {
         unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0);
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               entry_space -= sizeof(struct dx_tail);
         return entry_space / sizeof(struct dx_entry);
  }
  
@@ -390,6 +630,15 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                 goto fail;
         }
  
+       if (!buffer_verified(bh) &&
+           !ext4_dx_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data)) {
+               ext4_warning(dir->i_sb, "Root failed checksum");
+               brelse(bh);
+               *err = ERR_BAD_DX_DIR;
+               goto fail;
+       }
+       set_buffer_verified(bh);
+
         entries = (struct dx_entry *) (((char *)&root->info) +
                                        root->info.info_length);
  
@@ -450,6 +699,17 @@ dx_probe(const struct qstr *d_name, struct inode *dir,
                 if (!(bh = ext4_bread (NULL,dir, dx_get_block(at), 0, err)))
                         goto fail2;
                 at = entries = ((struct dx_node *) bh->b_data)->entries;
+
+               if (!buffer_verified(bh) &&
+                   !ext4_dx_csum_verify(dir,
+                                        (struct ext4_dir_entry *)bh->b_data)) {
+                       ext4_warning(dir->i_sb, "Node failed checksum");
+                       brelse(bh);
+                       *err = ERR_BAD_DX_DIR;
+                       goto fail;
+               }
+               set_buffer_verified(bh);
+
                 if (dx_get_limit(entries) != dx_node_limit (dir)) {
                         ext4_warning(dir->i_sb,
                                      "dx entry: limit != node limit");
@@ -549,6 +809,15 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash,
                 if (!(bh = ext4_bread(NULL, dir, dx_get_block(p->at),
                                       0, &err)))
                         return err; /* Failure */
+
+               if (!buffer_verified(bh) &&
+                   !ext4_dx_csum_verify(dir,
+                                        (struct ext4_dir_entry *)bh->b_data)) {
+                       ext4_warning(dir->i_sb, "Node failed checksum");
+                       return -EIO;
+               }
+               set_buffer_verified(bh);
+
                 p++;
                 brelse(p->bh);
                 p->bh = bh;
@@ -577,6 +846,11 @@ static int htree_dirblock_to_tree(struct file *dir_file,
         if (!(bh = ext4_bread (NULL, dir, block, 0, &err)))
                 return err;
  
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+               return -EIO;
+       set_buffer_verified(bh);
+
         de = (struct ext4_dir_entry_2 *) bh->b_data;
         top = (struct ext4_dir_entry_2 *) ((char *) de +
                                            dir->i_sb->s_blocksize -
@@ -936,6 +1210,15 @@ restart:
                         brelse(bh);
                         goto next;
                 }
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_INODE(dir, "checksumming directory "
+                                        "block %lu", (unsigned long)block);
+                       brelse(bh);
+                       goto next;
+               }
+               set_buffer_verified(bh);
                 i = search_dirblock(bh, dir, d_name,
                             block << EXT4_BLOCK_SIZE_BITS(sb), res_dir);
                 if (i == 1) {
@@ -987,6 +1270,16 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                 if (!(bh = ext4_bread(NULL, dir, block, 0, err)))
                         goto errout;
  
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data)) {
+                       EXT4_ERROR_INODE(dir, "checksumming directory "
+                                        "block %lu", (unsigned long)block);
+                       brelse(bh);
+                       *err = -EIO;
+                       goto errout;
+               }
+               set_buffer_verified(bh);
                 retval = search_dirblock(bh, dir, d_name,
                                          block << EXT4_BLOCK_SIZE_BITS(sb),
                                          res_dir);
@@ -1037,6 +1330,12 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                         EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
                         return ERR_PTR(-EIO);
                 }
+               if (unlikely(ino == dir->i_ino)) {
+                       EXT4_ERROR_INODE(dir, "'%.*s' linked to parent dir",
+                                        dentry->d_name.len,
+                                        dentry->d_name.name);
+                       return ERR_PTR(-EIO);
+               }
                 inode = ext4_iget(dir->i_sb, ino);
                 if (inode == ERR_PTR(-ESTALE)) {
                         EXT4_ERROR_INODE(dir,
@@ -1156,8 +1455,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
         char *data1 = (*bh)->b_data, *data2;
         unsigned split, move, size;
         struct ext4_dir_entry_2 *de = NULL, *de2;
+       struct ext4_dir_entry_tail *t;
+       int     csum_size = 0;
         int     err = 0, i;
  
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
         bh2 = ext4_append (handle, dir, &newblock, &err);
         if (!(bh2)) {
                 brelse(*bh);
@@ -1204,10 +1509,20 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
         /* Fancy dance to stay within two buffers */
         de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
         de = dx_pack_dirents(data1, blocksize);
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+       de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+                                          (char *) de,
                                            blocksize);
-       de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+       de2->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
+                                           (char *) de2,
                                             blocksize);
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(data2, blocksize);
+               initialize_dirent_tail(t, blocksize);
+
+               t = EXT4_DIRENT_TAIL(data1, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
         dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
  
@@ -1218,10 +1533,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                 de = de2;
         }
         dx_insert_block(frame, hash2 + continued, newblock);
-       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+       err = ext4_handle_dirty_dirent_node(handle, dir, bh2);
         if (err)
                 goto journal_error;
-       err = ext4_handle_dirty_metadata(handle, dir, frame->bh);
+       err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
         if (err)
                 goto journal_error;
         brelse(bh2);
@@ -1258,11 +1573,16 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
         unsigned short  reclen;
         int             nlen, rlen, err;
         char            *top;
+       int             csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
  
         reclen = EXT4_DIR_REC_LEN(namelen);
         if (!de) {
                 de = (struct ext4_dir_entry_2 *)bh->b_data;
-               top = bh->b_data + blocksize - reclen;
+               top = bh->b_data + (blocksize - csum_size) - reclen;
                 while ((char *) de <= top) {
                         if (ext4_check_dir_entry(dir, NULL, de, bh, offset))
                                 return -EIO;
@@ -1295,11 +1615,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                 de = de1;
         }
         de->file_type = EXT4_FT_UNKNOWN;
-       if (inode) {
-               de->inode = cpu_to_le32(inode->i_ino);
-               ext4_set_de_type(dir->i_sb, de, inode->i_mode);
-       } else
-               de->inode = 0;
+       de->inode = cpu_to_le32(inode->i_ino);
+       ext4_set_de_type(dir->i_sb, de, inode->i_mode);
         de->name_len = namelen;
         memcpy(de->name, name, namelen);
         /*
@@ -1318,7 +1635,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
         dir->i_version++;
         ext4_mark_inode_dirty(handle, dir);
         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, dir, bh);
+       err = ext4_handle_dirty_dirent_node(handle, dir, bh);
         if (err)
                 ext4_std_error(dir->i_sb, err);
         return 0;
@@ -1339,6 +1656,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
         struct dx_frame frames[2], *frame;
         struct dx_entry *entries;
         struct ext4_dir_entry_2 *de, *de2;
+       struct ext4_dir_entry_tail *t;
         char            *data1, *top;
         unsigned        len;
         int             retval;
@@ -1346,6 +1664,11 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
         struct dx_hash_info hinfo;
         ext4_lblk_t  block;
         struct fake_dirent *fde;
+       int             csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
  
         blocksize =  dir->i_sb->s_blocksize;
         dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino));
@@ -1366,7 +1689,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
                 brelse(bh);
                 return -EIO;
         }
-       len = ((char *) root) + blocksize - (char *) de;
+       len = ((char *) root) + (blocksize - csum_size) - (char *) de;
  
         /* Allocate new block for the 0th block's dirents */
         bh2 = ext4_append(handle, dir, &block, &retval);
@@ -1382,8 +1705,15 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
         top = data1 + len;
         while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
                 de = de2;
-       de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+       de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) -
+                                          (char *) de,
                                            blocksize);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(data1, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
         /* Initialize the root; the dot dirents already exist */
         de = (struct ext4_dir_entry_2 *) (&root->dotdot);
         de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
@@ -1408,8 +1738,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
         frame->bh = bh;
         bh = bh2;
  
-       ext4_handle_dirty_metadata(handle, dir, frame->bh);
-       ext4_handle_dirty_metadata(handle, dir, bh);
+       ext4_handle_dirty_dx_node(handle, dir, frame->bh);
+       ext4_handle_dirty_dirent_node(handle, dir, bh);
  
         de = do_split(handle,dir, &bh, frame, &hinfo, &retval);
         if (!de) {
@@ -1445,11 +1775,17 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
         struct inode *dir = dentry->d_parent->d_inode;
         struct buffer_head *bh;
         struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
         struct super_block *sb;
         int     retval;
         int     dx_fallback=0;
         unsigned blocksize;
         ext4_lblk_t block, blocks;
+       int     csum_size = 0;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
  
         sb = dir->i_sb;
         blocksize = sb->s_blocksize;
@@ -1468,6 +1804,11 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                 bh = ext4_bread(handle, dir, block, 0, &retval);
                 if(!bh)
                         return retval;
+               if (!buffer_verified(bh) &&
+                   !ext4_dirent_csum_verify(dir,
+                               (struct ext4_dir_entry *)bh->b_data))
+                       return -EIO;
+               set_buffer_verified(bh);
                 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
                 if (retval != -ENOSPC) {
                         brelse(bh);
@@ -1484,7 +1825,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                 return retval;
         de = (struct ext4_dir_entry_2 *) bh->b_data;
         de->inode = 0;
-       de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
+       de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(bh->b_data, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
         retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
         brelse(bh);
         if (retval == 0)
@@ -1516,6 +1863,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
         if (!(bh = ext4_bread(handle,dir, dx_get_block(frame->at), 0, &err)))
                 goto cleanup;
  
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(dir, (struct ext4_dir_entry *)bh->b_data))
+               goto journal_error;
+       set_buffer_verified(bh);
+
         BUFFER_TRACE(bh, "get_write_access");
         err = ext4_journal_get_write_access(handle, bh);
         if (err)
@@ -1583,7 +1935,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                         dxtrace(dx_show_index("node", frames[1].entries));
                         dxtrace(dx_show_index("node",
                                ((struct dx_node *) bh2->b_data)->entries));
-                       err = ext4_handle_dirty_metadata(handle, dir, bh2);
+                       err = ext4_handle_dirty_dx_node(handle, dir, bh2);
                         if (err)
                                 goto journal_error;
                         brelse (bh2);
@@ -1609,7 +1961,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                         if (err)
                                 goto journal_error;
                 }
-               err = ext4_handle_dirty_metadata(handle, dir, frames[0].bh);
+               err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh);
                 if (err) {
                         ext4_std_error(inode->i_sb, err);
                         goto cleanup;
@@ -1641,12 +1993,17 @@ static int ext4_delete_entry(handle_t *handle,
  {
         struct ext4_dir_entry_2 *de, *pde;
         unsigned int blocksize = dir->i_sb->s_blocksize;
+       int csum_size = 0;
         int i, err;
  
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
         i = 0;
         pde = NULL;
         de = (struct ext4_dir_entry_2 *) bh->b_data;
-       while (i < bh->b_size) {
+       while (i < bh->b_size - csum_size) {
                 if (ext4_check_dir_entry(dir, NULL, de, bh, i))
                         return -EIO;
                 if (de == de_del)  {
@@ -1667,7 +2024,7 @@ static int ext4_delete_entry(handle_t *handle,
                                 de->inode = 0;
                         dir->i_version++;
                         BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                       err = ext4_handle_dirty_metadata(handle, dir, bh);
+                       err = ext4_handle_dirty_dirent_node(handle, dir, bh);
                         if (unlikely(err)) {
                                 ext4_std_error(dir->i_sb, err);
                                 return err;
@@ -1809,9 +2166,15 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         struct inode *inode;
         struct buffer_head *dir_block = NULL;
         struct ext4_dir_entry_2 *de;
+       struct ext4_dir_entry_tail *t;
         unsigned int blocksize = dir->i_sb->s_blocksize;
+       int csum_size = 0;
         int err, retries = 0;
  
+       if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               csum_size = sizeof(struct ext4_dir_entry_tail);
+
         if (EXT4_DIR_LINK_MAX(dir))
                 return -EMLINK;
  
@@ -1852,16 +2215,24 @@ retry:
         ext4_set_de_type(dir->i_sb, de, S_IFDIR);
         de = ext4_next_entry(de, blocksize);
         de->inode = cpu_to_le32(dir->i_ino);
-       de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+       de->rec_len = ext4_rec_len_to_disk(blocksize -
+                                          (csum_size + EXT4_DIR_REC_LEN(1)),
                                            blocksize);
         de->name_len = 2;
         strcpy(de->name, "..");
         ext4_set_de_type(dir->i_sb, de, S_IFDIR);
         set_nlink(inode, 2);
+
+       if (csum_size) {
+               t = EXT4_DIRENT_TAIL(dir_block->b_data, blocksize);
+               initialize_dirent_tail(t, blocksize);
+       }
+
         BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata");
-       err = ext4_handle_dirty_metadata(handle, inode, dir_block);
+       err = ext4_handle_dirty_dirent_node(handle, inode, dir_block);
         if (err)
                 goto out_clear_inode;
+       set_buffer_verified(dir_block);
         err = ext4_mark_inode_dirty(handle, inode);
         if (!err)
                 err = ext4_add_entry(handle, dentry, inode);
@@ -1911,6 +2282,14 @@ static int empty_dir(struct inode *inode)
                                      inode->i_ino);
                 return 1;
         }
+       if (!buffer_verified(bh) &&
+           !ext4_dirent_csum_verify(inode,
+                       (struct ext4_dir_entry *)bh->b_data)) {
+               EXT4_ERROR_INODE(inode, "checksum error reading directory "
+                                "lblock 0");
+               return -EIO;
+       }
+       set_buffer_verified(bh);
         de = (struct ext4_dir_entry_2 *) bh->b_data;
         de1 = ext4_next_entry(de, sb->s_blocksize);
         if (le32_to_cpu(de->inode) != inode->i_ino ||
@@ -1942,6 +2321,14 @@ static int empty_dir(struct inode *inode)
                                 offset += sb->s_blocksize;
                                 continue;
                         }
+                       if (!buffer_verified(bh) &&
+                           !ext4_dirent_csum_verify(inode,
+                                       (struct ext4_dir_entry *)bh->b_data)) {
+                               EXT4_ERROR_INODE(inode, "checksum error "
+                                                "reading directory lblock 0");
+                               return -EIO;
+                       }
+                       set_buffer_verified(bh);
                         de = (struct ext4_dir_entry_2 *) bh->b_data;
                 }
                 if (ext4_check_dir_entry(inode, NULL, de, bh, offset)) {
@@ -2010,7 +2397,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
         /* Insert this inode at the head of the on-disk orphan list... */
         NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
         EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
-       err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_super_now(handle, sb);
         rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
         if (!err)
                 err = rc;
@@ -2083,7 +2470,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
                 if (err)
                         goto out_brelse;
                 sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
-               err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
+               err = ext4_handle_dirty_super_now(handle, inode->i_sb);
         } else {
                 struct ext4_iloc iloc2;
                 struct inode *i_prev =
@@ -2442,6 +2829,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                 dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
                 if (!dir_bh)
                         goto end_rename;
+               if (!buffer_verified(dir_bh) &&
+                   !ext4_dirent_csum_verify(old_inode,
+                               (struct ext4_dir_entry *)dir_bh->b_data))
+                       goto end_rename;
+               set_buffer_verified(dir_bh);
                 if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
                                 old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
                         goto end_rename;
@@ -2472,7 +2864,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                                         ext4_current_time(new_dir);
                 ext4_mark_inode_dirty(handle, new_dir);
                 BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata");
-               retval = ext4_handle_dirty_metadata(handle, new_dir, new_bh);
+               retval = ext4_handle_dirty_dirent_node(handle, new_dir, new_bh);
                 if (unlikely(retval)) {
                         ext4_std_error(new_dir->i_sb, retval);
                         goto end_rename;
@@ -2526,7 +2918,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                 PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
                                                 cpu_to_le32(new_dir->i_ino);
                 BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
-               retval = ext4_handle_dirty_metadata(handle, old_inode, dir_bh);
+               retval = ext4_handle_dirty_dirent_node(handle, old_inode,
+                                                      dir_bh);
                 if (retval) {
                         ext4_std_error(old_dir->i_sb, retval);
                         goto end_rename;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c

index 59fa0be272516adf6cbbc94384106690bf710c65..7ea6cbb44121952bf0d4f81f914950ab284dba6b 100644 (file)
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -161,6 +161,8 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
         if (flex_gd == NULL)
                 goto out3;
  
+       if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_flex_group_data))
+               goto out2;
         flex_gd->count = flexbg_size;
  
         flex_gd->groups = kmalloc(sizeof(struct ext4_new_group_data) *
@@ -796,7 +798,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
         ext4_kvfree(o_group_desc);
  
         le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
-       err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+       err = ext4_handle_dirty_super_now(handle, sb);
         if (err)
                 ext4_std_error(sb, err);
  
@@ -968,6 +970,8 @@ static void update_backups(struct super_block *sb,
                 goto exit_err;
         }
  
+       ext4_superblock_csum_set(sb, (struct ext4_super_block *)data);
+
         while ((group = ext4_list_backups(sb, &three, &five, &seven)) < last) {
                 struct buffer_head *bh;
  
@@ -1067,6 +1071,54 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
         return err;
  }
  
+/*
+ * Return a referenced buffer_head for bitmap block @block, reading it from
+ * disk when it is not already flagged bitmap-uptodate.
+ *
+ * Returns NULL if the buffer cannot be obtained or the read fails.  On
+ * success the caller holds a reference and must brelse() it.
+ */
+static struct buffer_head *ext4_get_bitmap(struct super_block *sb, __u64 block)
+{
+       struct buffer_head *bh = sb_getblk(sb, block);
+
+       if (!bh)
+               return NULL;
+
+       if (bitmap_uptodate(bh))
+               return bh;
+
+       /*
+        * bh_submit_read() must be entered with the buffer locked and drops
+        * that lock itself: directly when the data turns out to be uptodate,
+        * otherwise from the read completion handler.  Unlocking again here
+        * would release a lock this function no longer owns, so neither the
+        * success nor the failure path calls unlock_buffer().
+        */
+       lock_buffer(bh);
+       if (bh_submit_read(bh) < 0) {
+               brelse(bh);
+               return NULL;
+       }
+
+       return bh;
+}
+
+/*
+ * Record checksums of a new group's inode bitmap and block bitmap in its
+ * group descriptor @gdp.
+ *
+ * Returns 0 without touching anything when the metadata_csum feature is
+ * off, 0 after both checksums are set, or -EIO if either bitmap block
+ * cannot be fetched through ext4_get_bitmap().
+ */
+static int ext4_set_bitmap_checksums(struct super_block *sb,
+                                    ext4_group_t group,
+                                    struct ext4_group_desc *gdp,
+                                    struct ext4_new_group_data *group_data)
+{
+       struct buffer_head *bitmap_bh;
+
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 0;
+
+       /* Inode bitmap first; its length is one bit per inode in the group. */
+       bitmap_bh = ext4_get_bitmap(sb, group_data->inode_bitmap);
+       if (!bitmap_bh)
+               return -EIO;
+       ext4_inode_bitmap_csum_set(sb, group, gdp, bitmap_bh,
+                                  EXT4_INODES_PER_GROUP(sb) / 8);
+       brelse(bitmap_bh);
+
+       /* Then the block bitmap, one bit per block in the group. */
+       bitmap_bh = ext4_get_bitmap(sb, group_data->block_bitmap);
+       if (!bitmap_bh)
+               return -EIO;
+       ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh,
+                                  EXT4_BLOCKS_PER_GROUP(sb) / 8);
+       brelse(bitmap_bh);
+
+       return 0;
+}
+
  /*
   * ext4_setup_new_descs() will set up the group descriptor descriptors of a flex bg
   */
@@ -1093,18 +1145,24 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
                  */
                 gdb_bh = sbi->s_group_desc[gdb_num];
                 /* Update group descriptor block for new group */
-               gdp = (struct ext4_group_desc *)((char *)gdb_bh->b_data +
+               gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
                                                  gdb_off * EXT4_DESC_SIZE(sb));
  
                 memset(gdp, 0, EXT4_DESC_SIZE(sb));
                 ext4_block_bitmap_set(sb, gdp, group_data->block_bitmap);
                 ext4_inode_bitmap_set(sb, gdp, group_data->inode_bitmap);
+               err = ext4_set_bitmap_checksums(sb, group, gdp, group_data);
+               if (err) {
+                       ext4_std_error(sb, err);
+                       break;
+               }
+
                 ext4_inode_table_set(sb, gdp, group_data->inode_table);
                 ext4_free_group_clusters_set(sb, gdp,
                                              EXT4_B2C(sbi, group_data->free_blocks_count));
                 ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb));
                 gdp->bg_flags = cpu_to_le16(*bg_flags);
-               gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+               ext4_group_desc_csum_set(sb, group, gdp);
  
                 err = ext4_handle_dirty_metadata(handle, NULL, gdb_bh);
                 if (unlikely(err)) {
@@ -1343,17 +1401,14 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
                            (1 + ext4_bg_num_gdb(sb, group + i) +
                             le16_to_cpu(es->s_reserved_gdt_blocks)) : 0;
                 group_data[i].free_blocks_count = blocks_per_group - overhead;
-               if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                              EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+               if (ext4_has_group_desc_csum(sb))
                         flex_gd->bg_flags[i] = EXT4_BG_BLOCK_UNINIT |
                                                EXT4_BG_INODE_UNINIT;
                 else
                         flex_gd->bg_flags[i] = EXT4_BG_INODE_ZEROED;
         }
  
-       if (last_group == n_group &&
-           EXT4_HAS_RO_COMPAT_FEATURE(sb,
-                                      EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+       if (last_group == n_group && ext4_has_group_desc_csum(sb))
                 /* We need to initialize block bitmap of last group. */
                 flex_gd->bg_flags[i - 1] &= ~EXT4_BG_BLOCK_UNINIT;
  
diff --git a/fs/ext4/super.c b/fs/ext4/super.c

index 35b5954489eeb88c6c5a29fd76fced5c3472e6f5..eb7aa3e4ef05caf136f24e0565a28e6d1e0a1539 100644 (file)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -112,6 +112,48 @@ static struct file_system_type ext3_fs_type = {
  #define IS_EXT3_SB(sb) (0)
  #endif
  
+/*
+ * Check that the superblock names a checksum algorithm this code knows.
+ *
+ * Returns 1 when metadata_csum is not enabled (nothing to check) or when
+ * s_checksum_type is EXT4_CRC32C_CHKSUM, and 0 otherwise.
+ */
+static int ext4_verify_csum_type(struct super_block *sb,
+                                struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
+
+       return 1;
+}
+
+/*
+ * Compute the superblock checksum: ext4_chksum() seeded with ~0 over every
+ * byte of @es that precedes the s_checksum field.  The result is returned
+ * in little-endian form, ready to compare with or store into s_checksum.
+ */
+static __le32 ext4_superblock_csum(struct super_block *sb,
+                                  struct ext4_super_block *es)
+{
+       int covered = offsetof(struct ext4_super_block, s_checksum);
+       __u32 crc = ext4_chksum(EXT4_SB(sb), ~0, (char *)es, covered);
+
+       return cpu_to_le32(crc);
+}
+
+/*
+ * Verify the checksum stored in superblock @es.
+ *
+ * Returns 1 if metadata_csum is disabled or the stored s_checksum matches
+ * the freshly computed value, 0 on mismatch.
+ */
+int ext4_superblock_csum_verify(struct super_block *sb,
+                               struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return es->s_checksum == ext4_superblock_csum(sb, es);
+
+       return 1;
+}
+
+/*
+ * Recompute and store the checksum of superblock @es.  A no-op when the
+ * metadata_csum feature is not enabled on @sb.
+ */
+void ext4_superblock_csum_set(struct super_block *sb,
+                             struct ext4_super_block *es)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               es->s_checksum = ext4_superblock_csum(sb, es);
+}
+
  void *ext4_kvmalloc(size_t size, gfp_t flags)
  {
         void *ret;
@@ -497,6 +539,7 @@ void __ext4_error(struct super_block *sb, const char *function,
         printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: %pV\n",
                sb->s_id, function, line, current->comm, &vaf);
         va_end(args);
+       save_error_info(sb, function, line);
  
         ext4_handle_error(sb);
  }
@@ -905,6 +948,8 @@ static void ext4_put_super(struct super_block *sb)
         unlock_super(sb);
         kobject_put(&sbi->s_kobj);
         wait_for_completion(&sbi->s_kobj_unregister);
+       if (sbi->s_chksum_driver)
+               crypto_free_shash(sbi->s_chksum_driver);
         kfree(sbi->s_blockgroup_lock);
         kfree(sbi);
  }
@@ -1922,43 +1967,69 @@ failed:
         return 0;
  }
  
-__le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
-                           struct ext4_group_desc *gdp)
+static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group,
+                                  struct ext4_group_desc *gdp)
  {
+       int offset;
         __u16 crc = 0;
+       __le32 le_group = cpu_to_le32(block_group);
  
-       if (sbi->s_es->s_feature_ro_compat &
-           cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-               int offset = offsetof(struct ext4_group_desc, bg_checksum);
-               __le32 le_group = cpu_to_le32(block_group);
-
-               crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
-               crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
-               crc = crc16(crc, (__u8 *)gdp, offset);
-               offset += sizeof(gdp->bg_checksum); /* skip checksum */
-               /* for checksum of struct ext4_group_desc do the rest...*/
-               if ((sbi->s_es->s_feature_incompat &
-                    cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
-                   offset < le16_to_cpu(sbi->s_es->s_desc_size))
-                       crc = crc16(crc, (__u8 *)gdp + offset,
-                                   le16_to_cpu(sbi->s_es->s_desc_size) -
-                                       offset);
+       if ((sbi->s_es->s_feature_ro_compat &
+            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) {
+               /* Use new metadata_csum algorithm */
+               __u16 old_csum;
+               __u32 csum32;
+
+               old_csum = gdp->bg_checksum;
+               gdp->bg_checksum = 0;
+               csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+                                    sizeof(le_group));
+               csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp,
+                                    sbi->s_desc_size);
+               gdp->bg_checksum = old_csum;
+
+               crc = csum32 & 0xFFFF;
+               goto out;
         }
  
+       /* old crc16 code */
+       offset = offsetof(struct ext4_group_desc, bg_checksum);
+
+       crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid));
+       crc = crc16(crc, (__u8 *)&le_group, sizeof(le_group));
+       crc = crc16(crc, (__u8 *)gdp, offset);
+       offset += sizeof(gdp->bg_checksum); /* skip checksum */
+       /* for checksum of struct ext4_group_desc do the rest...*/
+       if ((sbi->s_es->s_feature_incompat &
+            cpu_to_le32(EXT4_FEATURE_INCOMPAT_64BIT)) &&
+           offset < le16_to_cpu(sbi->s_es->s_desc_size))
+               crc = crc16(crc, (__u8 *)gdp + offset,
+                           le16_to_cpu(sbi->s_es->s_desc_size) -
+                               offset);
+
+out:
         return cpu_to_le16(crc);
  }
  
-int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 block_group,
+int ext4_group_desc_csum_verify(struct super_block *sb, __u32 block_group,
                                 struct ext4_group_desc *gdp)
  {
-       if ((sbi->s_es->s_feature_ro_compat &
-            cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) &&
-           (gdp->bg_checksum != ext4_group_desc_csum(sbi, block_group, gdp)))
+       if (ext4_has_group_desc_csum(sb) &&
+           (gdp->bg_checksum != ext4_group_desc_csum(EXT4_SB(sb),
+                                                     block_group, gdp)))
                 return 0;
  
         return 1;
  }
  
+/*
+ * Recompute and store bg_checksum for group descriptor @gdp of group
+ * @block_group.  Leaves @gdp untouched when ext4_has_group_desc_csum()
+ * reports that descriptor checksums are not in use.
+ */
+void ext4_group_desc_csum_set(struct super_block *sb, __u32 block_group,
+                             struct ext4_group_desc *gdp)
+{
+       if (ext4_has_group_desc_csum(sb))
+               gdp->bg_checksum = ext4_group_desc_csum(EXT4_SB(sb),
+                                                       block_group, gdp);
+}
+
  /* Called at mount-time, super-block is locked */
  static int ext4_check_descriptors(struct super_block *sb,
                                   ext4_group_t *first_not_zeroed)
@@ -2013,7 +2084,7 @@ static int ext4_check_descriptors(struct super_block *sb,
                         return 0;
                 }
                 ext4_lock_group(sb, i);
-               if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
+               if (!ext4_group_desc_csum_verify(sb, i, gdp)) {
                         ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                                  "Checksum for group %u failed (%u!=%u)",
                                  i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
@@ -2417,6 +2488,23 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
         return count;
  }
  
+/*
+ * Store handler for the "trigger_fs_error" attribute: raise an ext4 error
+ * on the filesystem using the written text as the message.
+ *
+ * Requires CAP_SYS_ADMIN (otherwise -EPERM).  One trailing newline is
+ * dropped; if nothing remains, no error is raised.  Returns @count so the
+ * whole write is reported as consumed.
+ */
+static ssize_t trigger_test_error(struct ext4_attr *a,
+                                 struct ext4_sb_info *sbi,
+                                 const char *buf, size_t count)
+{
+       int msg_len = count;
+
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       /* Strip the newline that "echo" appends. */
+       if (msg_len && buf[msg_len - 1] == '\n')
+               msg_len--;
+
+       if (!msg_len)
+               return count;
+
+       ext4_error(sbi->s_sb, "%.*s", msg_len, buf);
+       return count;
+}
+
  #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
  static struct ext4_attr ext4_attr_##_name = {                  \
         .attr = {.name = __stringify(_name), .mode = _mode },   \
@@ -2447,6 +2535,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
  EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
  EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
  EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
+EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
  
  static struct attribute *ext4_attrs[] = {
         ATTR_LIST(delayed_allocation_blocks),
@@ -2461,6 +2550,7 @@ static struct attribute *ext4_attrs[] = {
         ATTR_LIST(mb_stream_req),
         ATTR_LIST(mb_group_prealloc),
         ATTR_LIST(max_writeback_mb_bump),
+       ATTR_LIST(trigger_fs_error),
         NULL,
  };
  
@@ -2957,6 +3047,44 @@ static void ext4_destroy_lazyinit_thread(void)
         kthread_stop(ext4_lazyinit_task);
  }
  
+/*
+ * Bring the journal's checksum feature flags in line with the mount
+ * options and the metadata_csum feature.
+ *
+ * With metadata_csum the journal uses the v2 checksum (an incompat flag);
+ * without it the v1 checksum (a compat flag) is used.  The flags are set
+ * for journal_async_commit or journal_checksum mounts and cleared
+ * otherwise.
+ *
+ * Returns the result of jbd2_journal_set_features() when a flag had to be
+ * set, and 1 when flags were only cleared.
+ */
+static int set_journal_csum_feature_set(struct super_block *sb)
+{
+       journal_t *journal = EXT4_SB(sb)->s_journal;
+       int compat = 0;
+       int incompat = 0;
+       int ret;
+
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               incompat = JBD2_FEATURE_INCOMPAT_CSUM_V2;  /* checksum v2 */
+       else
+               compat = JBD2_FEATURE_COMPAT_CHECKSUM;     /* checksum v1 */
+
+       if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
+               return jbd2_journal_set_features(journal, compat, 0,
+                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+                               incompat);
+
+       if (test_opt(sb, JOURNAL_CHECKSUM)) {
+               /* Checksums wanted, async commit not: set one, drop other. */
+               ret = jbd2_journal_set_features(journal, compat, 0, incompat);
+               jbd2_journal_clear_features(journal, 0, 0,
+                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+               return ret;
+       }
+
+       /* Neither option given: clear every checksum-related flag. */
+       jbd2_journal_clear_features(journal,
+                       JBD2_FEATURE_COMPAT_CHECKSUM, 0,
+                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT |
+                       JBD2_FEATURE_INCOMPAT_CSUM_V2);
+       return 1;
+}
+
  static int ext4_fill_super(struct super_block *sb, void *data, int silent)
  {
         char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -2993,6 +3121,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 goto out_free_orig;
         }
         sb->s_fs_info = sbi;
+       sbi->s_sb = sb;
         sbi->s_mount_opt = 0;
         sbi->s_resuid = make_kuid(&init_user_ns, EXT4_DEF_RESUID);
         sbi->s_resgid = make_kgid(&init_user_ns, EXT4_DEF_RESGID);
@@ -3032,13 +3161,54 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
          * Note: s_es must be initialized as soon as possible because
          *       some ext4 macro-instructions depend on its value
          */
-       es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+       es = (struct ext4_super_block *) (bh->b_data + offset);
         sbi->s_es = es;
         sb->s_magic = le16_to_cpu(es->s_magic);
         if (sb->s_magic != EXT4_SUPER_MAGIC)
                 goto cantfind_ext4;
         sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
  
+       /* Warn if metadata_csum and gdt_csum are both set. */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
+           EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
+               ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are "
+                            "redundant flags; please run fsck.");
+
+       /* Check for a known checksum algorithm */
+       if (!ext4_verify_csum_type(sb, es)) {
+               ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+                        "unknown checksum algorithm.");
+               silent = 1;
+               goto cantfind_ext4;
+       }
+
+       /* Load the checksum driver */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                                      EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) {
+               sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+               if (IS_ERR(sbi->s_chksum_driver)) {
+                       ext4_msg(sb, KERN_ERR, "Cannot load crc32c driver.");
+                       ret = PTR_ERR(sbi->s_chksum_driver);
+                       sbi->s_chksum_driver = NULL;
+                       goto failed_mount;
+               }
+       }
+
+       /* Check superblock checksum */
+       if (!ext4_superblock_csum_verify(sb, es)) {
+               ext4_msg(sb, KERN_ERR, "VFS: Found ext4 filesystem with "
+                        "invalid superblock checksum.  Run e2fsck?");
+               silent = 1;
+               goto cantfind_ext4;
+       }
+
+       /* Precompute checksum seed for all metadata */
+       if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
+                       EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+                                              sizeof(es->s_uuid));
+
         /* Set defaults before we parse the mount options */
         def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
         set_opt(sb, INIT_INODE_TABLE);
@@ -3200,7 +3370,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                "Can't read superblock on 2nd try");
                         goto failed_mount;
                 }
-               es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
+               es = (struct ext4_super_block *)(bh->b_data + offset);
                 sbi->s_es = es;
                 if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
                         ext4_msg(sb, KERN_ERR,
@@ -3392,6 +3562,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                           GFP_KERNEL);
         if (sbi->s_group_desc == NULL) {
                 ext4_msg(sb, KERN_ERR, "not enough memory");
+               ret = -ENOMEM;
                 goto failed_mount;
         }
  
@@ -3449,6 +3620,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         }
         if (err) {
                 ext4_msg(sb, KERN_ERR, "insufficient memory");
+               ret = err;
                 goto failed_mount3;
         }
  
@@ -3506,26 +3678,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 goto no_journal;
         }
  
-       if (ext4_blocks_count(es) > 0xffffffffULL &&
+       if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT) &&
             !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                        JBD2_FEATURE_INCOMPAT_64BIT)) {
                 ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                 goto failed_mount_wq;
         }
  
-       if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
-               jbd2_journal_set_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-       } else if (test_opt(sb, JOURNAL_CHECKSUM)) {
-               jbd2_journal_set_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0);
-               jbd2_journal_clear_features(sbi->s_journal, 0, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
-       } else {
-               jbd2_journal_clear_features(sbi->s_journal,
-                               JBD2_FEATURE_COMPAT_CHECKSUM, 0,
-                               JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
+       if (!set_journal_csum_feature_set(sb)) {
+               ext4_msg(sb, KERN_ERR, "Failed to set journal checksum "
+                        "feature set");
+               goto failed_mount_wq;
         }
  
         /* We have now updated the journal if required, so we can
@@ -3606,7 +3769,8 @@ no_journal:
                 goto failed_mount4;
         }
  
-       ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY);
+       if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
+               sb->s_flags |= MS_RDONLY;
  
         /* determine the minimum size of new large inodes, if present */
         if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
@@ -3641,7 +3805,7 @@ no_journal:
         }
  
         ext4_ext_init(sb);
-       err = ext4_mb_init(sb, needs_recovery);
+       err = ext4_mb_init(sb);
         if (err) {
                 ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)",
                          err);
@@ -3724,6 +3888,8 @@ failed_mount2:
                 brelse(sbi->s_group_desc[i]);
         ext4_kvfree(sbi->s_group_desc);
  failed_mount:
+       if (sbi->s_chksum_driver)
+               crypto_free_shash(sbi->s_chksum_driver);
         if (sbi->s_proc) {
                 remove_proc_entry("options", sbi->s_proc);
                 remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -3847,7 +4013,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
                 goto out_bdev;
         }
  
-       es = (struct ext4_super_block *) (((char *)bh->b_data) + offset);
+       es = (struct ext4_super_block *) (bh->b_data + offset);
         if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
             !(le32_to_cpu(es->s_feature_incompat) &
               EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
@@ -4039,6 +4205,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
                                 &EXT4_SB(sb)->s_freeinodes_counter));
         sb->s_dirt = 0;
         BUFFER_TRACE(sbh, "marking dirty");
+       ext4_superblock_csum_set(sb, es);
         mark_buffer_dirty(sbh);
         if (sync) {
                 error = sync_dirty_buffer(sbh);
@@ -4333,7 +4500,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                 struct ext4_group_desc *gdp =
                                         ext4_get_group_desc(sb, g, NULL);
  
-                               if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
+                               if (!ext4_group_desc_csum_verify(sb, g, gdp)) {
                                         ext4_msg(sb, KERN_ERR,
                "ext4_remount: Checksum for group %u failed (%u!=%u)",
                 g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c

index e88748e55c0f246e90ca21c2094303719f83df07..e56c9ed7d6e30d523b7f8e4b638f9190427cf50d 100644 (file)
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -122,6 +122,58 @@ const struct xattr_handler *ext4_xattr_handlers[] = {
         NULL
  };
  
+/*
+ * Compute the checksum of the xattr block whose header is @hdr.
+ *
+ * The whole block (EXT4_BLOCK_SIZE bytes) is checksummed with h_checksum
+ * temporarily zeroed; the stored value is restored before returning.  The
+ * starting seed depends on h_refcount: when it is not 1 the seed is derived
+ * from the filesystem seed and the 64-bit little-endian block number,
+ * otherwise the inode's own i_csum_seed is used.
+ *
+ * Returns the checksum in little-endian form.
+ */
+static __le32 ext4_xattr_block_csum(struct inode *inode,
+                                   sector_t block_nr,
+                                   struct ext4_xattr_header *hdr)
+{
+       struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+       struct ext4_inode_info *ei = EXT4_I(inode);
+       /*
+        * Keep the on-disk form of the block number in its own fixed-width
+        * variable.  Writing it back into the sector_t parameter would
+        * truncate it, and hash only 4 bytes, where sector_t is 32 bits.
+        */
+       __le64 dsk_block_nr = cpu_to_le64(block_nr);
+       __le32 saved_csum;
+       __u32 csum;
+
+       saved_csum = hdr->h_checksum;
+       hdr->h_checksum = 0;
+       if (le32_to_cpu(hdr->h_refcount) != 1)
+               csum = ext4_chksum(sbi, sbi->s_csum_seed,
+                                  (__u8 *)&dsk_block_nr,
+                                  sizeof(dsk_block_nr));
+       else
+               csum = ei->i_csum_seed;
+       csum = ext4_chksum(sbi, csum, (__u8 *)hdr,
+                          EXT4_BLOCK_SIZE(inode->i_sb));
+       hdr->h_checksum = saved_csum;
+       return cpu_to_le32(csum);
+}
+
+/*
+ * Verify the checksum stored in xattr block header @hdr.
+ *
+ * Returns 1 if metadata_csum is disabled or h_checksum matches the value
+ * computed by ext4_xattr_block_csum(), 0 on mismatch.
+ */
+static int ext4_xattr_block_csum_verify(struct inode *inode,
+                                       sector_t block_nr,
+                                       struct ext4_xattr_header *hdr)
+{
+       if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               return 1;
+
+       return hdr->h_checksum == ext4_xattr_block_csum(inode, block_nr, hdr);
+}
+
+/*
+ * Recompute and store h_checksum for the xattr block headed by @hdr.
+ * A no-op when the metadata_csum feature is not enabled.
+ */
+static void ext4_xattr_block_csum_set(struct inode *inode,
+                                     sector_t block_nr,
+                                     struct ext4_xattr_header *hdr)
+{
+       if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+               EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))
+               hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr,
+                                                       hdr);
+}
+
+/*
+ * Refresh the checksum of xattr block @bh, then mark it dirty as journalled
+ * metadata.  Returns whatever ext4_handle_dirty_metadata() returns.
+ */
+static inline int ext4_handle_dirty_xattr_block(handle_t *handle,
+                                               struct inode *inode,
+                                               struct buffer_head *bh)
+{
+       struct ext4_xattr_header *hdr = BHDR(bh);
+
+       /* The checksum must be current before the buffer is handed off. */
+       ext4_xattr_block_csum_set(inode, bh->b_blocknr, hdr);
+
+       return ext4_handle_dirty_metadata(handle, inode, bh);
+}
+
  static inline const struct xattr_handler *
  ext4_xattr_handler(int name_index)
  {
@@ -156,12 +208,22 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end)
  }
  
  static inline int
-ext4_xattr_check_block(struct buffer_head *bh)
+ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh)
  {
+       int error;      /* result of ext4_xattr_check_names() */
+
+       if (buffer_verified(bh))        /* skip every check below */
+               return 0;
+
         if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
             BHDR(bh)->h_blocks != cpu_to_le32(1))
                 return -EIO;
-       return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+       if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh)))
+               return -EIO;    /* verify returned 0: checksum mismatch */
+       error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
+       if (!error)     /* magic, block count, checksum and names all passed */
+               set_buffer_verified(bh);        /* presumably what buffer_verified() tests above */
+       return error;
  }
  
  static inline int
@@ -224,7 +286,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
                 goto cleanup;
         ea_bdebug(bh, "b_count=%d, refcount=%d",
                 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-       if (ext4_xattr_check_block(bh)) {
+       if (ext4_xattr_check_block(inode, bh)) {
  bad_block:
                 EXT4_ERROR_INODE(inode, "bad block %llu",
                                  EXT4_I(inode)->i_file_acl);
@@ -369,7 +431,7 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
                 goto cleanup;
         ea_bdebug(bh, "b_count=%d, refcount=%d",
                 atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
-       if (ext4_xattr_check_block(bh)) {
+       if (ext4_xattr_check_block(inode, bh)) {
                 EXT4_ERROR_INODE(inode, "bad block %llu",
                                  EXT4_I(inode)->i_file_acl);
                 error = -EIO;
@@ -492,7 +554,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
                 if (ce)
                         mb_cache_entry_release(ce);
                 unlock_buffer(bh);
-               error = ext4_handle_dirty_metadata(handle, inode, bh);
+               error = ext4_handle_dirty_xattr_block(handle, inode, bh);
                 if (IS_SYNC(inode))
                         ext4_handle_sync(handle);
                 dquot_free_block(inode, 1);
@@ -662,7 +724,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                 ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
                         atomic_read(&(bs->bh->b_count)),
                         le32_to_cpu(BHDR(bs->bh)->h_refcount));
-               if (ext4_xattr_check_block(bs->bh)) {
+               if (ext4_xattr_check_block(inode, bs->bh)) {
                         EXT4_ERROR_INODE(inode, "bad block %llu",
                                          EXT4_I(inode)->i_file_acl);
                         error = -EIO;
@@ -725,9 +787,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
                         if (error == -EIO)
                                 goto bad_block;
                         if (!error)
-                               error = ext4_handle_dirty_metadata(handle,
-                                                                  inode,
-                                                                  bs->bh);
+                               error = ext4_handle_dirty_xattr_block(handle,
+                                                                     inode,
+                                                                     bs->bh);
                         if (error)
                                 goto cleanup;
                         goto inserted;
@@ -796,9 +858,9 @@ inserted:
                                 ea_bdebug(new_bh, "reusing; refcount now=%d",
                                         le32_to_cpu(BHDR(new_bh)->h_refcount));
                                 unlock_buffer(new_bh);
-                               error = ext4_handle_dirty_metadata(handle,
-                                                                  inode,
-                                                                  new_bh);
+                               error = ext4_handle_dirty_xattr_block(handle,
+                                                                     inode,
+                                                                     new_bh);
                                 if (error)
                                         goto cleanup_dquot;
                         }
@@ -855,8 +917,8 @@ getblk_failed:
                         set_buffer_uptodate(new_bh);
                         unlock_buffer(new_bh);
                         ext4_xattr_cache_insert(new_bh);
-                       error = ext4_handle_dirty_metadata(handle,
-                                                          inode, new_bh);
+                       error = ext4_handle_dirty_xattr_block(handle,
+                                                             inode, new_bh);
                         if (error)
                                 goto cleanup;
                 }
@@ -1193,7 +1255,7 @@ retry:
                 error = -EIO;
                 if (!bh)
                         goto cleanup;
-               if (ext4_xattr_check_block(bh)) {
+               if (ext4_xattr_check_block(inode, bh)) {
                         EXT4_ERROR_INODE(inode, "bad block %llu",
                                          EXT4_I(inode)->i_file_acl);
                         error = -EIO;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h

index 25b7387ff183f880cdb9ccaf2529ca8c0f218a7b..91f31ca7d9af9df24a965c64bb0271c43a4d4b09 100644 (file)
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -27,7 +27,9 @@ struct ext4_xattr_header {
         __le32  h_refcount;     /* reference count */
         __le32  h_blocks;       /* number of disk blocks used */
         __le32  h_hash;         /* hash value of all attributes */
-       __u32   h_reserved[4];  /* zero right now */
+       __le32  h_checksum;     /* crc32c(uuid+id+xattrblock) */
+                               /* id = inum if refcount=1, blknum otherwise */
+       __u32   h_reserved[3];  /* zero right now */
  };
  
  struct ext4_xattr_ibody_header {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c

index c2973ea5df9ab57ccd6ee6a44dc2c13ebb22ab35..a3d81ebf6d864a8c2189147e5771c435473b2e42 100644 (file)
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -735,10 +735,9 @@ static struct dentry *fat_fh_to_dentry(struct super_block *sb,
  }
  
  static int
-fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
+fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent)
  {
         int len = *lenp;
-       struct inode *inode =  de->d_inode;
         u32 ipos_h, ipos_m, ipos_l;
  
         if (len < 5) {
@@ -754,9 +753,9 @@ fat_encode_fh(struct dentry *de, __u32 *fh, int *lenp, int connectable)
         fh[1] = inode->i_generation;
         fh[2] = ipos_h;
         fh[3] = ipos_m | MSDOS_I(inode)->i_logstart;
-       spin_lock(&de->d_lock);
-       fh[4] = ipos_l | MSDOS_I(de->d_parent->d_inode)->i_logstart;
-       spin_unlock(&de->d_lock);
+       fh[4] = ipos_l;
+       if (parent)
+               fh[4] |= MSDOS_I(parent)->i_logstart;
         return 3;
  }
  
diff --git a/fs/fcntl.c b/fs/fcntl.c

index d078b75572a75eb9117092ee5bb752c84e1b38b8..81b70e665bf000412f73aa300890a53823db36f0 100644 (file)
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -442,28 +442,24 @@ static int check_fcntl_cmd(unsigned cmd)
  SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
  {      
         struct file *filp;
+       int fput_needed;        /* filled in by fget_raw_light(), handed to fput_light() */
         long err = -EBADF;
  
-       filp = fget_raw(fd);
+       filp = fget_raw_light(fd, &fput_needed);
         if (!filp)
                 goto out;
  
         if (unlikely(filp->f_mode & FMODE_PATH)) {
-               if (!check_fcntl_cmd(cmd)) {
-                       fput(filp);
-                       goto out;
-               }
+               if (!check_fcntl_cmd(cmd))
+                       goto out1;      /* err is still -EBADF here */
         }
  
         err = security_file_fcntl(filp, cmd, arg);
-       if (err) {
-               fput(filp);
-               return err;
-       }
+       if (!err)       /* run the command only if the security hook returned 0 */
+               err = do_fcntl(fd, cmd, arg, filp);
  
-       err = do_fcntl(fd, cmd, arg, filp);
-
-       fput(filp);
+out1:  /* every path that obtained filp passes through here */
+       fput_light(filp, fput_needed);
  out:
         return err;
  }
@@ -473,26 +469,21 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                 unsigned long, arg)
  {      
         struct file * filp;
-       long err;
+       long err = -EBADF;
+       int fput_needed;
  
-       err = -EBADF;
-       filp = fget_raw(fd);
+       filp = fget_raw_light(fd, &fput_needed);
         if (!filp)
                 goto out;
  
         if (unlikely(filp->f_mode & FMODE_PATH)) {
-               if (!check_fcntl_cmd(cmd)) {
-                       fput(filp);
-                       goto out;
-               }
+               if (!check_fcntl_cmd(cmd))
+                       goto out1;
         }
  
         err = security_file_fcntl(filp, cmd, arg);
-       if (err) {
-               fput(filp);
-               return err;
-       }
-       err = -EBADF;
+       if (err)
+               goto out1;
         
         switch (cmd) {
                 case F_GETLK64:
@@ -507,7 +498,8 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd,
                         err = do_fcntl(fd, cmd, arg, filp);
                         break;
         }
-       fput(filp);
+out1:
+       fput_light(filp, fput_needed);
  out:
         return err;
  }
diff --git a/fs/file_table.c b/fs/file_table.c

index 70f2a0fd6aec62b28724d46e356dc0ff871f88b8..a305d9e2d1b2aac05dcd456bdd23885652272439 100644 (file)
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -34,7 +34,6 @@ struct files_stat_struct files_stat = {
         .max_files = NR_FILE
  };
  
-DECLARE_LGLOCK(files_lglock);
  DEFINE_LGLOCK(files_lglock);
  
  /* SLAB cache for file structures */
@@ -421,9 +420,9 @@ static inline void __file_sb_list_add(struct file *file, struct super_block *sb)
   */
  void file_sb_list_add(struct file *file, struct super_block *sb)
  {
-       lg_local_lock(files_lglock);
+       lg_local_lock(&files_lglock);
         __file_sb_list_add(file, sb);
-       lg_local_unlock(files_lglock);
+       lg_local_unlock(&files_lglock);
  }
  
  /**
@@ -436,9 +435,9 @@ void file_sb_list_add(struct file *file, struct super_block *sb)
  void file_sb_list_del(struct file *file)
  {
         if (!list_empty(&file->f_u.fu_list)) {
-               lg_local_lock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_lock_cpu(&files_lglock, file_list_cpu(file));
                 list_del_init(&file->f_u.fu_list);
-               lg_local_unlock_cpu(files_lglock, file_list_cpu(file));
+               lg_local_unlock_cpu(&files_lglock, file_list_cpu(file));
         }
  }
  
@@ -485,7 +484,7 @@ void mark_files_ro(struct super_block *sb)
         struct file *f;
  
  retry:
-       lg_global_lock(files_lglock);
+       lg_global_lock(&files_lglock);
         do_file_list_for_each_entry(sb, f) {
                 struct vfsmount *mnt;
                 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
@@ -502,12 +501,12 @@ retry:
                 file_release_write(f);
                 mnt = mntget(f->f_path.mnt);
                 /* This can sleep, so we can't hold the spinlock. */
-               lg_global_unlock(files_lglock);
+               lg_global_unlock(&files_lglock);
                 mnt_drop_write(mnt);
                 mntput(mnt);
                 goto retry;
         } while_file_list_for_each_entry;
-       lg_global_unlock(files_lglock);
+       lg_global_unlock(&files_lglock);
  }
  
  void __init files_init(unsigned long mempages)
@@ -525,6 +524,6 @@ void __init files_init(unsigned long mempages)
         n = (mempages * (PAGE_SIZE / 1024)) / 10;
         files_stat.max_files = max_t(unsigned long, n, NR_FILE);
         files_defer_init();
-       lg_lock_init(files_lglock);
+       lg_lock_init(&files_lglock, "files_lglock");
         percpu_counter_init(&nr_files, 0);
  } 
diff --git a/fs/fuse/file.c b/fs/fuse/file.c

index 504e61b7fd7515f8aafe7e3b9edd2c9fa42fd91d..9562109d3a879b3dab50ee27d989f3ae89c8b833 100644 (file)
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -962,7 +962,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
         if (err)
                 goto out;
  
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
  
         if (file->f_flags & O_DIRECT) {
                 written = generic_file_direct_write(iocb, iov, &nr_segs,
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c

index 56f6dcf307684287bad491b8711fa3a4ef4f0633..42678a33b7bb6297ced300f7fbb61696d37628c9 100644 (file)
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -627,12 +627,10 @@ static struct dentry *fuse_get_dentry(struct super_block *sb,
         return ERR_PTR(err);
  }
  
-static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
-                          int connectable)
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+                          struct inode *parent)
  {
-       struct inode *inode = dentry->d_inode;
-       bool encode_parent = connectable && !S_ISDIR(inode->i_mode);
-       int len = encode_parent ? 6 : 3;
+       int len = parent ? 6 : 3;
         u64 nodeid;
         u32 generation;
  
@@ -648,14 +646,9 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
         fh[1] = (u32)(nodeid & 0xffffffff);
         fh[2] = generation;
  
-       if (encode_parent) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                 nodeid = get_fuse_inode(parent)->nodeid;
                 generation = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
  
                 fh[3] = (u32)(nodeid >> 32);
                 fh[4] = (u32)(nodeid & 0xffffffff);
@@ -663,7 +656,7 @@ static int fuse_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
         }
  
         *max_len = len;
-       return encode_parent ? 0x82 : 0x81;
+       return parent ? 0x82 : 0x81;
  }
  
  static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c

index 70ba891654f8ce3582c456e208feda6d56e90a1a..e8ed6d4a6181132ff47960dc118cd6fb60c1b81c 100644 (file)
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -28,15 +28,14 @@
  #define GFS2_LARGE_FH_SIZE 8
  #define GFS2_OLD_FH_SIZE 10
  
-static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
-                         int connectable)
+static int gfs2_encode_fh(struct inode *inode, __u32 *p, int *len,
+                         struct inode *parent)
  {
         __be32 *fh = (__force __be32 *)p;
-       struct inode *inode = dentry->d_inode;
         struct super_block *sb = inode->i_sb;
         struct gfs2_inode *ip = GFS2_I(inode);
  
-       if (connectable && (*len < GFS2_LARGE_FH_SIZE)) {
+       if (parent && (*len < GFS2_LARGE_FH_SIZE)) {
                 *len = GFS2_LARGE_FH_SIZE;
                 return 255;
         } else if (*len < GFS2_SMALL_FH_SIZE) {
@@ -50,14 +49,10 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
         fh[3] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
         *len = GFS2_SMALL_FH_SIZE;
  
-       if (!connectable || inode == sb->s_root->d_inode)
+       if (!parent || inode == sb->s_root->d_inode)
                 return *len;
  
-       spin_lock(&dentry->d_lock);
-       inode = dentry->d_parent->d_inode;
-       ip = GFS2_I(inode);
-       igrab(inode);
-       spin_unlock(&dentry->d_lock);
+       ip = GFS2_I(parent);
  
         fh[4] = cpu_to_be32(ip->i_no_formal_ino >> 32);
         fh[5] = cpu_to_be32(ip->i_no_formal_ino & 0xFFFFFFFF);
@@ -65,8 +60,6 @@ static int gfs2_encode_fh(struct dentry *dentry, __u32 *p, int *len,
         fh[7] = cpu_to_be32(ip->i_no_addr & 0xFFFFFFFF);
         *len = GFS2_LARGE_FH_SIZE;
  
-       iput(inode);
-
         return *len;
  }
  
diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c

index 7a5eb2c718c854206d6db419abe4ce7bc61d12c7..cdb84a8380682b5f341138cb6f75e2754434e073 100644 (file)
--- a/fs/hpfs/alloc.c
+++ b/fs/hpfs/alloc.c
@@ -16,9 +16,9 @@
  static int chk_if_allocated(struct super_block *s, secno sec, char *msg)
  {
         struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
         if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "chk"))) goto fail;
-       if ((cpu_to_le32(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
+       if ((le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) >> (sec & 0x1f)) & 1) {
                 hpfs_error(s, "sector '%s' - %08x not allocated in bitmap", msg, sec);
                 goto fail1;
         }
@@ -62,7 +62,7 @@ int hpfs_chk_sectors(struct super_block *s, secno start, int len, char *msg)
  static secno alloc_in_bmp(struct super_block *s, secno near, unsigned n, unsigned forward)
  {
         struct quad_buffer_head qbh;
-       unsigned *bmp;
+       __le32 *bmp;
         unsigned bs = near & ~0x3fff;
         unsigned nr = (near & 0x3fff) & ~(n - 1);
         /*unsigned mnr;*/
@@ -236,7 +236,7 @@ static secno alloc_in_dirband(struct super_block *s, secno near)
  int hpfs_alloc_if_possible(struct super_block *s, secno sec)
  {
         struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
         if (!(bmp = hpfs_map_bitmap(s, sec >> 14, &qbh, "aip"))) goto end;
         if (le32_to_cpu(bmp[(sec & 0x3fff) >> 5]) & (1 << (sec & 0x1f))) {
                 bmp[(sec & 0x3fff) >> 5] &= cpu_to_le32(~(1 << (sec & 0x1f)));
@@ -254,7 +254,7 @@ int hpfs_alloc_if_possible(struct super_block *s, secno sec)
  void hpfs_free_sectors(struct super_block *s, secno sec, unsigned n)
  {
         struct quad_buffer_head qbh;
-       u32 *bmp;
+       __le32 *bmp;
         struct hpfs_sb_info *sbi = hpfs_sb(s);
         /*printk("2 - ");*/
         if (!n) return;
@@ -299,7 +299,7 @@ int hpfs_check_free_dnodes(struct super_block *s, int n)
         int n_bmps = (hpfs_sb(s)->sb_fs_size + 0x4000 - 1) >> 14;
         int b = hpfs_sb(s)->sb_c_bitmap & 0x0fffffff;
         int i, j;
-       u32 *bmp;
+       __le32 *bmp;
         struct quad_buffer_head qbh;
         if ((bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
                 for (j = 0; j < 512; j++) {
@@ -351,7 +351,7 @@ void hpfs_free_dnode(struct super_block *s, dnode_secno dno)
                 hpfs_free_sectors(s, dno, 4);
         } else {
                 struct quad_buffer_head qbh;
-               u32 *bmp;
+               __le32 *bmp;
                 unsigned ssec = (dno - hpfs_sb(s)->sb_dirband_start) / 4;
                 if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) {
                         return;
diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c

index 08b503e8ed29ec610a098cb9658e1a2ecaa1779c..4bae4a4a60b1936eba70d17d18e7d4a016ed54b9 100644 (file)
--- a/fs/hpfs/anode.c
+++ b/fs/hpfs/anode.c
@@ -20,7 +20,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode,
         int c1, c2 = 0;
         go_down:
         if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_bplus_lookup")) return -1;
-       if (btree->internal) {
+       if (bp_internal(btree)) {
                 for (i = 0; i < btree->n_used_nodes; i++)
                         if (le32_to_cpu(btree->u.internal[i].file_secno) > sec) {
                                 a = le32_to_cpu(btree->u.internal[i].down);
@@ -82,7 +82,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                 brelse(bh);
                 return -1;
         }
-       if (btree->internal) {
+       if (bp_internal(btree)) {
                 a = le32_to_cpu(btree->u.internal[n].down);
                 btree->u.internal[n].file_secno = cpu_to_le32(-1);
                 mark_buffer_dirty(bh);
@@ -129,12 +129,12 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                 }
                 if (a == node && fnod) {
                         anode->up = cpu_to_le32(node);
-                       anode->btree.fnode_parent = 1;
+                       anode->btree.flags |= BP_fnode_parent;
                         anode->btree.n_used_nodes = btree->n_used_nodes;
                         anode->btree.first_free = btree->first_free;
                         anode->btree.n_free_nodes = 40 - anode->btree.n_used_nodes;
                         memcpy(&anode->u, &btree->u, btree->n_used_nodes * 12);
-                       btree->internal = 1;
+                       btree->flags |= BP_internal;
                         btree->n_free_nodes = 11;
                         btree->n_used_nodes = 1;
                         btree->first_free = cpu_to_le16((char *)&(btree->u.internal[1]) - (char *)btree);
@@ -184,7 +184,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                         hpfs_free_sectors(s, ra, 1);
                         if ((anode = hpfs_map_anode(s, na, &bh))) {
                                 anode->up = cpu_to_le32(up);
-                               anode->btree.fnode_parent = up == node && fnod;
+                               if (up == node && fnod)
+                                       anode->btree.flags |= BP_fnode_parent;
+                               else
+                                       anode->btree.flags &= ~BP_fnode_parent;
                                 mark_buffer_dirty(bh);
                                 brelse(bh);
                         }
@@ -198,7 +201,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
                 if ((new_anode = hpfs_alloc_anode(s, a, &na, &bh))) {
                         anode = new_anode;
                         /*anode->up = cpu_to_le32(up != -1 ? up : ra);*/
-                       anode->btree.internal = 1;
+                       anode->btree.flags |= BP_internal;
                         anode->btree.n_used_nodes = 1;
                         anode->btree.n_free_nodes = 59;
                         anode->btree.first_free = cpu_to_le16(16);
@@ -215,7 +218,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
         }
         if ((anode = hpfs_map_anode(s, na, &bh))) {
                 anode->up = cpu_to_le32(node);
-               if (fnod) anode->btree.fnode_parent = 1;
+               if (fnod)
+                       anode->btree.flags |= BP_fnode_parent;
                 mark_buffer_dirty(bh);
                 brelse(bh);
         }
@@ -234,18 +238,19 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi
         }
         ranode->up = cpu_to_le32(node);
         memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free));
-       if (fnod) ranode->btree.fnode_parent = 1;
-       ranode->btree.n_free_nodes = (ranode->btree.internal ? 60 : 40) - ranode->btree.n_used_nodes;
-       if (ranode->btree.internal) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
+       if (fnod)
+               ranode->btree.flags |= BP_fnode_parent;
+       ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes;
+       if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) {
                 struct anode *unode;
                 if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) {
                         unode->up = cpu_to_le32(ra);
-                       unode->btree.fnode_parent = 0;
+                       unode->btree.flags &= ~BP_fnode_parent;
                         mark_buffer_dirty(bh1);
                         brelse(bh1);
                 }
         }
-       btree->internal = 1;
+       btree->flags |= BP_internal;
         btree->n_free_nodes = fnod ? 10 : 58;
         btree->n_used_nodes = 2;
         btree->first_free = cpu_to_le16((char *)&btree->u.internal[2] - (char *)btree);
@@ -278,7 +283,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree)
         int d1, d2;
         go_down:
         d2 = 0;
-       while (btree1->internal) {
+       while (bp_internal(btree1)) {
                 ano = le32_to_cpu(btree1->u.internal[pos].down);
                 if (level) brelse(bh);
                 if (hpfs_sb(s)->sb_chk)
@@ -412,13 +417,13 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs)
                         btree->n_free_nodes = 8;
                         btree->n_used_nodes = 0;
                         btree->first_free = cpu_to_le16(8);
-                       btree->internal = 0;
+                       btree->flags &= ~BP_internal;
                         mark_buffer_dirty(bh);
                 } else hpfs_free_sectors(s, f, 1);
                 brelse(bh);
                 return;
         }
-       while (btree->internal) {
+       while (bp_internal(btree)) {
                 nodes = btree->n_used_nodes + btree->n_free_nodes;
                 for (i = 0; i < btree->n_used_nodes; i++)
                         if (le32_to_cpu(btree->u.internal[i].file_secno) >= secs) goto f;
@@ -479,13 +484,13 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno)
         struct extended_attribute *ea;
         struct extended_attribute *ea_end;
         if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return;
-       if (!fnode->dirflag) hpfs_remove_btree(s, &fnode->btree);
+       if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree);
         else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno));
         ea_end = fnode_end_ea(fnode);
         for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
-               if (ea->indirect)
-                       hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
-       hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l));
+               if (ea_indirect(ea))
+                       hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
+       hpfs_ea_ext_remove(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l));
         brelse(bh);
         hpfs_free_sectors(s, fno, 1);
  }
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c

index 2fa0089a02a8ec2934cda55cbbae18e50c34a4ea..b8472f803f4e54ea5039b85ac36cfdf33a48925b 100644 (file)
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -87,7 +87,7 @@ static int hpfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
                         ret = -EIOERROR;
                         goto out;
                 }
-               if (!fno->dirflag) {
+               if (!fnode_is_dir(fno)) {
                         e = 1;
                         hpfs_error(inode->i_sb, "not a directory, fnode %08lx",
                                         (unsigned long)inode->i_ino);
diff --git a/fs/hpfs/dnode.c b/fs/hpfs/dnode.c

index 1e0e2ac30fd3be93f8e5b7a97618f19a52220ec4..3228c524ebe56f948d8896cec23ca6b1284f6303 100644 (file)
--- a/fs/hpfs/dnode.c
+++ b/fs/hpfs/dnode.c
@@ -153,7 +153,7 @@ static void set_last_pointer(struct super_block *s, struct dnode *d, dnode_secno
                 }
                 de->length = cpu_to_le16(36);
                 de->down = 1;
-               *(dnode_secno *)((char *)de + 32) = cpu_to_le32(ptr);
+               *(__le32 *)((char *)de + 32) = cpu_to_le32(ptr);
         }
  }
  
@@ -177,7 +177,7 @@ struct hpfs_dirent *hpfs_add_de(struct super_block *s, struct dnode *d,
         memmove((char *)de + d_size, de, (char *)de_end - (char *)de);
         memset(de, 0, d_size);
         if (down_ptr) {
-               *(dnode_secno *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
+               *(__le32 *)((char *)de + d_size - 4) = cpu_to_le32(down_ptr);
                 de->down = 1;
         }
         de->length = cpu_to_le16(d_size);
@@ -656,7 +656,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
                                 del->down = 0;
                                 d1->first_free = cpu_to_le32(le32_to_cpu(d1->first_free) - 4);
                         } else if (down)
-                               *(dnode_secno *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
+                               *(__le32 *) ((void *) del + le16_to_cpu(del->length) - 4) = cpu_to_le32(down);
                 } else goto endm;
                 if (!(de_cp = kmalloc(le16_to_cpu(de_prev->length), GFP_NOFS))) {
                         printk("HPFS: out of memory for dtree balancing\n");
@@ -672,7 +672,7 @@ static void delete_empty_dnode(struct inode *i, dnode_secno dno)
                         de_prev->down = 1;
                         dnode->first_free = cpu_to_le32(le32_to_cpu(dnode->first_free) + 4);
                 }
-               *(dnode_secno *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
+               *(__le32 *) ((void *) de_prev + le16_to_cpu(de_prev->length) - 4) = cpu_to_le32(ndown);
                 hpfs_mark_4buffers_dirty(&qbh);
                 hpfs_brelse4(&qbh);
                 for_all_poss(i, hpfs_pos_subst, ((loff_t)up << 4) | (p - 1), 4);
@@ -1015,7 +1015,7 @@ struct hpfs_dirent *map_fnode_dirent(struct super_block *s, fnode_secno fno,
                 kfree(name2);
                 return NULL;
         }       
-       if (!upf->dirflag) {
+       if (!fnode_is_dir(upf)) {
                 brelse(bh);
                 hpfs_error(s, "fnode %08x has non-directory parent %08x", fno, le32_to_cpu(f->up));
                 kfree(name2);
diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c

index d8b84d113c891bbcfd8416d3f35153983b0549a7..bcaafcd2666ac275d02c2f054023cc537ebd7644 100644 (file)
--- a/fs/hpfs/ea.c
+++ b/fs/hpfs/ea.c
@@ -23,15 +23,15 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len)
                         return;
                 }
                 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
-               if (ea->indirect) {
+               if (ea_indirect(ea)) {
                         if (ea_valuelen(ea) != 8) {
-                               hpfs_error(s, "ea->indirect set while ea->valuelen!=8, %s %08x, pos %08x",
+                               hpfs_error(s, "ea_indirect(ea) set while ea->valuelen!=8, %s %08x, pos %08x",
                                         ano ? "anode" : "sectors", a, pos);
                                 return;
                         }
                         if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 9, ex+4))
                                 return;
-                       hpfs_ea_remove(s, ea_sec(ea), ea->anode, ea_len(ea));
+                       hpfs_ea_remove(s, ea_sec(ea), ea_in_anode(ea), ea_len(ea));
                 }
                 pos += ea->namelen + ea_valuelen(ea) + 5;
         }
@@ -81,7 +81,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
         struct extended_attribute *ea_end = fnode_end_ea(fnode);
         for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
+                       if (ea_indirect(ea))
                                 goto indirect;
                         if (ea_valuelen(ea) >= size)
                                 return -EINVAL;
@@ -91,7 +91,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
                 }
         a = le32_to_cpu(fnode->ea_secno);
         len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
         pos = 0;
         while (pos < len) {
                 ea = (struct extended_attribute *)ex;
@@ -101,10 +101,10 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
                         return -EIO;
                 }
                 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return -EIO;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                         return -EIO;
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
+                       if (ea_indirect(ea))
                                 goto indirect;
                         if (ea_valuelen(ea) >= size)
                                 return -EINVAL;
@@ -119,7 +119,7 @@ int hpfs_read_ea(struct super_block *s, struct fnode *fnode, char *key,
  indirect:
         if (ea_len(ea) >= size)
                 return -EINVAL;
-       if (hpfs_ea_read(s, ea_sec(ea), ea->anode, 0, ea_len(ea), buf))
+       if (hpfs_ea_read(s, ea_sec(ea), ea_in_anode(ea), 0, ea_len(ea), buf))
                 return -EIO;
         buf[ea_len(ea)] = 0;
         return 0;
@@ -136,8 +136,8 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
         struct extended_attribute *ea_end = fnode_end_ea(fnode);
         for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
-                               return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+                       if (ea_indirect(ea))
+                               return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
                         if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
                                 printk("HPFS: out of memory for EA\n");
                                 return NULL;
@@ -148,7 +148,7 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
                 }
         a = le32_to_cpu(fnode->ea_secno);
         len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
         pos = 0;
         while (pos < len) {
                 char ex[4 + 255 + 1 + 8];
@@ -159,11 +159,11 @@ char *hpfs_get_ea(struct super_block *s, struct fnode *fnode, char *key, int *si
                         return NULL;
                 }
                 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return NULL;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                         return NULL;
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect)
-                               return get_indirect_ea(s, ea->anode, ea_sec(ea), *size = ea_len(ea));
+                       if (ea_indirect(ea))
+                               return get_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), *size = ea_len(ea));
                         if (!(ret = kmalloc((*size = ea_valuelen(ea)) + 1, GFP_NOFS))) {
                                 printk("HPFS: out of memory for EA\n");
                                 return NULL;
@@ -199,9 +199,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
         struct extended_attribute *ea_end = fnode_end_ea(fnode);
         for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea))
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect) {
+                       if (ea_indirect(ea)) {
                                 if (ea_len(ea) == size)
-                                       set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+                                       set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
                         } else if (ea_valuelen(ea) == size) {
                                 memcpy(ea_data(ea), data, size);
                         }
@@ -209,7 +209,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                 }
         a = le32_to_cpu(fnode->ea_secno);
         len = le32_to_cpu(fnode->ea_size_l);
-       ano = fnode->ea_anode;
+       ano = fnode_in_anode(fnode);
         pos = 0;
         while (pos < len) {
                 char ex[4 + 255 + 1 + 8];
@@ -220,12 +220,12 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                         return;
                 }
                 if (hpfs_ea_read(s, a, ano, pos, 4, ex)) return;
-               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea->indirect ? 8 : 0), ex + 4))
+               if (hpfs_ea_read(s, a, ano, pos + 4, ea->namelen + 1 + (ea_indirect(ea) ? 8 : 0), ex + 4))
                         return;
                 if (!strcmp(ea->name, key)) {
-                       if (ea->indirect) {
+                       if (ea_indirect(ea)) {
                                 if (ea_len(ea) == size)
-                                       set_indirect_ea(s, ea->anode, ea_sec(ea), data, size);
+                                       set_indirect_ea(s, ea_in_anode(ea), ea_sec(ea), data, size);
                         }
                         else {
                                 if (ea_valuelen(ea) == size)
@@ -246,7 +246,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
         if (le16_to_cpu(fnode->ea_offs) < 0xc4 || le16_to_cpu(fnode->ea_offs) + le16_to_cpu(fnode->acl_size_s) + le16_to_cpu(fnode->ea_size_s) > 0x200) {
                 hpfs_error(s, "fnode %08lx: ea_offs == %03x, ea_size_s == %03x",
                         (unsigned long)inode->i_ino,
-                       le32_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
+                       le16_to_cpu(fnode->ea_offs), le16_to_cpu(fnode->ea_size_s));
                 return;
         }
         if ((le16_to_cpu(fnode->ea_size_s) || !le32_to_cpu(fnode->ea_size_l)) &&
@@ -276,7 +276,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                 fnode->ea_size_l = cpu_to_le32(le16_to_cpu(fnode->ea_size_s));
                 fnode->ea_size_s = cpu_to_le16(0);
                 fnode->ea_secno = cpu_to_le32(n);
-               fnode->ea_anode = cpu_to_le32(0);
+               fnode->flags &= ~FNODE_anode;
                 mark_buffer_dirty(bh);
                 brelse(bh);
         }
@@ -288,9 +288,9 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                         secno q = hpfs_alloc_sector(s, fno, 1, 0);
                         if (!q) goto bail;
                         fnode->ea_secno = cpu_to_le32(q);
-                       fnode->ea_anode = 0;
+                       fnode->flags &= ~FNODE_anode;
                         len++;
-               } else if (!fnode->ea_anode) {
+               } else if (!fnode_in_anode(fnode)) {
                         if (hpfs_alloc_if_possible(s, le32_to_cpu(fnode->ea_secno) + len)) {
                                 len++;
                         } else {
@@ -310,7 +310,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                                 anode->u.external[0].length = cpu_to_le32(len);
                                 mark_buffer_dirty(bh);
                                 brelse(bh);
-                               fnode->ea_anode = 1;
+                               fnode->flags |= FNODE_anode;
                                 fnode->ea_secno = cpu_to_le32(a_s);*/
                                 secno new_sec;
                                 int i;
@@ -338,7 +338,7 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
                                 len = (pos + 511) >> 9;
                         }
                 }
-               if (fnode->ea_anode) {
+               if (fnode_in_anode(fnode)) {
                         if (hpfs_add_sector_to_btree(s, le32_to_cpu(fnode->ea_secno),
                                                      0, len) != -1) {
                                 len++;
@@ -351,16 +351,16 @@ void hpfs_set_ea(struct inode *inode, struct fnode *fnode, const char *key,
         h[1] = strlen(key);
         h[2] = size & 0xff;
         h[3] = size >> 8;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
-       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode->ea_anode, le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l), 4, h)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 4, h[1] + 1, key)) goto bail;
+       if (hpfs_ea_write(s, le32_to_cpu(fnode->ea_secno), fnode_in_anode(fnode), le32_to_cpu(fnode->ea_size_l) + 5 + h[1], size, data)) goto bail;
         fnode->ea_size_l = cpu_to_le32(pos);
         ret:
         hpfs_i(inode)->i_ea_size += 5 + strlen(key) + size;
         return;
         bail:
         if (le32_to_cpu(fnode->ea_secno))
-               if (fnode->ea_anode) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
+               if (fnode_in_anode(fnode)) hpfs_truncate_btree(s, le32_to_cpu(fnode->ea_secno), 1, (le32_to_cpu(fnode->ea_size_l) + 511) >> 9);
                 else hpfs_free_sectors(s, le32_to_cpu(fnode->ea_secno) + ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9), len - ((le32_to_cpu(fnode->ea_size_l) + 511) >> 9));
         else fnode->ea_secno = fnode->ea_size_l = cpu_to_le32(0);
  }
diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h

index 8b0650aae32812bac9abbb581439b592d64949d2..cce025aff1b19b86f824cd47bfb6c9457d583068 100644 (file)
--- a/fs/hpfs/hpfs.h
+++ b/fs/hpfs/hpfs.h
@@ -51,11 +51,11 @@ struct hpfs_boot_block
    u8 n_rootdir_entries[2];
    u8 n_sectors_s[2];
    u8 media_byte;
-  u16 sectors_per_fat;
-  u16 sectors_per_track;
-  u16 heads_per_cyl;
-  u32 n_hidden_sectors;
-  u32 n_sectors_l;             /* size of partition */
+  __le16 sectors_per_fat;
+  __le16 sectors_per_track;
+  __le16 heads_per_cyl;
+  __le32 n_hidden_sectors;
+  __le32 n_sectors_l;          /* size of partition */
    u8 drive_number;
    u8 mbz;
    u8 sig_28h;                  /* 28h */
@@ -63,7 +63,7 @@ struct hpfs_boot_block
    u8 vol_label[11];
    u8 sig_hpfs[8];              /* "HPFS    " */
    u8 pad[448];
-  u16 magic;                   /* aa55 */
+  __le16 magic;                        /* aa55 */
  };
  
  
@@ -75,28 +75,28 @@ struct hpfs_boot_block
  
  struct hpfs_super_block
  {
-  u32 magic;                           /* f995 e849 */
-  u32 magic1;                          /* fa53 e9c5, more magic? */
+  __le32 magic;                                /* f995 e849 */
+  __le32 magic1;                       /* fa53 e9c5, more magic? */
    u8 version;                          /* version of a filesystem  usually 2 */
    u8 funcversion;                      /* functional version - oldest version
                                            of filesystem that can understand
                                            this disk */
-  u16 zero;                            /* 0 */
-  fnode_secno root;                    /* fnode of root directory */
-  secno n_sectors;                     /* size of filesystem */
-  u32 n_badblocks;                     /* number of bad blocks */
-  secno bitmaps;                       /* pointers to free space bit maps */
-  u32 zero1;                           /* 0 */
-  secno badblocks;                     /* bad block list */
-  u32 zero3;                           /* 0 */
-  time32_t last_chkdsk;                        /* date last checked, 0 if never */
-  time32_t last_optimize;              /* date last optimized, 0 if never */
-  secno n_dir_band;                    /* number of sectors in dir band */
-  secno dir_band_start;                        /* first sector in dir band */
-  secno dir_band_end;                  /* last sector in dir band */
-  secno dir_band_bitmap;               /* free space map, 1 dnode per bit */
+  __le16 zero;                         /* 0 */
+  __le32 root;                         /* fnode of root directory */
+  __le32 n_sectors;                    /* size of filesystem */
+  __le32 n_badblocks;                  /* number of bad blocks */
+  __le32 bitmaps;                      /* pointers to free space bit maps */
+  __le32 zero1;                                /* 0 */
+  __le32 badblocks;                    /* bad block list */
+  __le32 zero3;                                /* 0 */
+  __le32 last_chkdsk;                  /* date last checked, 0 if never */
+  __le32 last_optimize;                        /* date last optimized, 0 if never */
+  __le32 n_dir_band;                   /* number of sectors in dir band */
+  __le32 dir_band_start;                       /* first sector in dir band */
+  __le32 dir_band_end;                 /* last sector in dir band */
+  __le32 dir_band_bitmap;              /* free space map, 1 dnode per bit */
    u8 volume_name[32];                  /* not used */
-  secno user_id_table;                 /* 8 preallocated sectors - user id */
+  __le32 user_id_table;                        /* 8 preallocated sectors - user id */
    u32 zero6[103];                      /* 0 */
  };
  
@@ -109,8 +109,8 @@ struct hpfs_super_block
  
  struct hpfs_spare_block
  {
-  u32 magic;                           /* f991 1849 */
-  u32 magic1;                          /* fa52 29c5, more magic? */
+  __le32 magic;                                /* f991 1849 */
+  __le32 magic1;                               /* fa52 29c5, more magic? */
  
  #ifdef __LITTLE_ENDIAN
    u8 dirty: 1;                         /* 0 clean, 1 "improperly stopped" */
@@ -153,21 +153,21 @@ struct hpfs_spare_block
    u8 mm_contlgulty;
    u8 unused;
  
-  secno hotfix_map;                    /* info about remapped bad sectors */
-  u32 n_spares_used;                   /* number of hotfixes */
-  u32 n_spares;                                /* number of spares in hotfix map */
-  u32 n_dnode_spares_free;             /* spare dnodes unused */
-  u32 n_dnode_spares;                  /* length of spare_dnodes[] list,
+  __le32 hotfix_map;                   /* info about remapped bad sectors */
+  __le32 n_spares_used;                        /* number of hotfixes */
+  __le32 n_spares;                     /* number of spares in hotfix map */
+  __le32 n_dnode_spares_free;          /* spare dnodes unused */
+  __le32 n_dnode_spares;               /* length of spare_dnodes[] list,
                                            follows in this block*/
-  secno code_page_dir;                 /* code page directory block */
-  u32 n_code_pages;                    /* number of code pages */
-  u32 super_crc;                       /* on HPFS386 and LAN Server this is
+  __le32 code_page_dir;                        /* code page directory block */
+  __le32 n_code_pages;                 /* number of code pages */
+  __le32 super_crc;                    /* on HPFS386 and LAN Server this is
                                            checksum of superblock, on normal
                                            OS/2 unused */
-  u32 spare_crc;                       /* on HPFS386 checksum of spareblock */
-  u32 zero1[15];                       /* unused */
-  dnode_secno spare_dnodes[100];       /* emergency free dnode list */
-  u32 zero2[1];                                /* room for more? */
+  __le32 spare_crc;                    /* on HPFS386 checksum of spareblock */
+  __le32 zero1[15];                    /* unused */
+  __le32 spare_dnodes[100];            /* emergency free dnode list */
+  __le32 zero2[1];                     /* room for more? */
  };
  
  /* The bad block list is 4 sectors long.  The first word must be zero,
@@ -202,18 +202,18 @@ struct hpfs_spare_block
  
  struct code_page_directory
  {
-  u32 magic;                           /* 4945 21f7 */
-  u32 n_code_pages;                    /* number of pointers following */
-  u32 zero1[2];
+  __le32 magic;                                /* 4945 21f7 */
+  __le32 n_code_pages;                 /* number of pointers following */
+  __le32 zero1[2];
    struct {
-    u16 ix;                            /* index */
-    u16 code_page_number;              /* code page number */
-    u32 bounds;                                /* matches corresponding word
+    __le16 ix;                         /* index */
+    __le16 code_page_number;           /* code page number */
+    __le32 bounds;                     /* matches corresponding word
                                            in data block */
-    secno code_page_data;              /* sector number of a code_page_data
+    __le32 code_page_data;             /* sector number of a code_page_data
                                            containing c.p. array */
-    u16 index;                         /* index in c.p. array in that sector*/
-    u16 unknown;                       /* some unknown value; usually 0;
+    __le16 index;                      /* index in c.p. array in that sector*/
+    __le16 unknown;                    /* some unknown value; usually 0;
                                            2 in Japanese version */
    } array[31];                         /* unknown length */
  };
@@ -224,19 +224,19 @@ struct code_page_directory
  
  struct code_page_data
  {
-  u32 magic;                           /* 8945 21f7 */
-  u32 n_used;                          /* # elements used in c_p_data[] */
-  u32 bounds[3];                       /* looks a bit like
+  __le32 magic;                                /* 8945 21f7 */
+  __le32 n_used;                       /* # elements used in c_p_data[] */
+  __le32 bounds[3];                    /* looks a bit like
                                              (beg1,end1), (beg2,end2)
                                            one byte each */
-  u16 offs[3];                         /* offsets from start of sector
+  __le16 offs[3];                      /* offsets from start of sector
                                            to start of c_p_data[ix] */
    struct {
-    u16 ix;                            /* index */
-    u16 code_page_number;              /* code page number */
-    u16 unknown;                       /* the same as in cp directory */
+    __le16 ix;                         /* index */
+    __le16 code_page_number;           /* code page number */
+    __le16 unknown;                    /* the same as in cp directory */
      u8 map[128];                       /* upcase table for chars 80..ff */
-    u16 zero2;
+    __le16 zero2;
    } code_page[3];
    u8 incognita[78];
  };
@@ -278,8 +278,8 @@ struct code_page_data
  #define DNODE_MAGIC   0x77e40aae
  
  struct dnode {
-  u32 magic;                           /* 77e4 0aae */
-  u32 first_free;                      /* offset from start of dnode to
+  __le32 magic;                                /* 77e4 0aae */
+  __le32 first_free;                   /* offset from start of dnode to
                                            first free dir entry */
  #ifdef __LITTLE_ENDIAN
    u8 root_dnode: 1;                    /* Is it root dnode? */
@@ -293,14 +293,14 @@ struct dnode {
    u8 root_dnode: 1;                    /* Is it root dnode? */
  #endif
    u8 increment_me2[3];
-  secno up;                            /* (root dnode) directory's fnode
+  __le32 up;                           /* (root dnode) directory's fnode
                                            (nonroot) parent dnode */
-  dnode_secno self;                    /* pointer to this dnode */
+  __le32 self;                 /* pointer to this dnode */
    u8 dirent[2028];                     /* one or more dirents */
  };
  
  struct hpfs_dirent {
-  u16 length;                          /* offset to next dirent */
+  __le16 length;                       /* offset to next dirent */
  
  #ifdef __LITTLE_ENDIAN
    u8 first: 1;                         /* set on phony ^A^A (".") entry */
@@ -346,12 +346,12 @@ struct hpfs_dirent {
    u8 read_only: 1;                     /* dos attrib */
  #endif
  
-  fnode_secno fnode;                   /* fnode giving allocation info */
-  time32_t write_date;                 /* mtime */
-  u32 file_size;                       /* file length, bytes */
-  time32_t read_date;                  /* atime */
-  time32_t creation_date;                      /* ctime */
-  u32 ea_size;                         /* total EA length, bytes */
+  __le32 fnode;                                /* fnode giving allocation info */
+  __le32 write_date;                   /* mtime */
+  __le32 file_size;                    /* file length, bytes */
+  __le32 read_date;                    /* atime */
+  __le32 creation_date;                        /* ctime */
+  __le32 ea_size;                      /* total EA length, bytes */
    u8 no_of_acls;                       /* number of ACL's (low 3 bits) */
    u8 ix;                               /* code page index (of filename), see
                                            struct code_page_data */
@@ -375,50 +375,36 @@ struct hpfs_dirent {
  
  struct bplus_leaf_node
  {
-  u32 file_secno;                      /* first file sector in extent */
-  u32 length;                          /* length, sectors */
-  secno disk_secno;                    /* first corresponding disk sector */
+  __le32 file_secno;                   /* first file sector in extent */
+  __le32 length;                       /* length, sectors */
+  __le32 disk_secno;                   /* first corresponding disk sector */
  };
  
  struct bplus_internal_node
  {
-  u32 file_secno;                      /* subtree maps sectors < this  */
-  anode_secno down;                    /* pointer to subtree */
+  __le32 file_secno;                   /* subtree maps sectors < this  */
+  __le32 down;                         /* pointer to subtree */
  };
  
+enum {
+       BP_hbff = 1,			/* bit 0: high bit of first free entry offset */
+       BP_fnode_parent = 0x20,		/* bit 5: header is pointed to by an fnode */
+       BP_binary_search = 0x40,	/* bit 6: suggest binary search (unused) */
+       BP_internal = 0x80		/* bit 7: set -> tree of anodes, clear -> list of extents */
+};
  struct bplus_header
  {
-#ifdef __LITTLE_ENDIAN
-  u8 hbff: 1;                  /* high bit of first free entry offset */
-  u8 flag1234: 4;
-  u8 fnode_parent: 1;                  /* ? we're pointed to by an fnode,
-                                          the data btree or some ea or the
-                                          main ea bootage pointer ea_secno */
-                                       /* also can get set in fnodes, which
-                                          may be a chkdsk glitch or may mean
-                                          this bit is irrelevant in fnodes,
-                                          or this interpretation is all wet */
-  u8 binary_search: 1;                 /* suggest binary search (unused) */
-  u8 internal: 1;                      /* 1 -> (internal) tree of anodes
-                                          0 -> (leaf) list of extents */
-#else
-  u8 internal: 1;                      /* 1 -> (internal) tree of anodes
-                                          0 -> (leaf) list of extents */
-  u8 binary_search: 1;                 /* suggest binary search (unused) */
-  u8 fnode_parent: 1;                  /* ? we're pointed to by an fnode,
+  u8 flags;                            /* bit 0 - high bit of first free entry offset
+                                          bit 5 - we're pointed to by an fnode,
                                            the data btree or some ea or the
-                                          main ea bootage pointer ea_secno */
-                                       /* also can get set in fnodes, which
-                                          may be a chkdsk glitch or may mean
-                                          this bit is irrelevant in fnodes,
-                                          or this interpretation is all wet */
-  u8 flag1234: 4;
-  u8 hbff: 1;                  /* high bit of first free entry offset */
-#endif
+                                          main ea bootage pointer ea_secno
+                                          bit 6 - suggest binary search (unused)
+                                          bit 7 - 1 -> (internal) tree of anodes
+                                                  0 -> (leaf) list of extents */
    u8 fill[3];
    u8 n_free_nodes;                     /* free nodes in following array */
    u8 n_used_nodes;                     /* used nodes in following array */
-  u16 first_free;                      /* offset from start of header to
+  __le16 first_free;                   /* offset from start of header to
                                            first free node in array */
    union {
      struct bplus_internal_node internal[0]; /* (internal) 2-word entries giving
@@ -428,6 +414,16 @@ struct bplus_header
    } u;
  };
  
+/*
+ * Report whether a b+ tree header describes an internal node.
+ * BP_internal (bit 7 of ->flags) set means the node holds a tree of anodes;
+ * clear means it is a leaf holding a list of extents.
+ */
+static inline bool bp_internal(struct bplus_header *bp)
+{
+       if (bp->flags & BP_internal)
+               return true;
+       return false;
+}
+
+/*
+ * Report whether the BP_fnode_parent bit (bit 5 of ->flags) is set, i.e.
+ * whether this b+ tree header is marked as being pointed to by an fnode.
+ */
+static inline bool bp_fnode_parent(struct bplus_header *bp)
+{
+       if (bp->flags & BP_fnode_parent)
+               return true;
+       return false;
+}
+
  /* fnode: root of allocation b+ tree, and EA's */
  
  /* Every file and every directory has one fnode, pointed to by the directory
@@ -436,62 +432,56 @@ struct bplus_header
  
  #define FNODE_MAGIC 0xf7e40aae
  
+enum {FNODE_anode = cpu_to_le16(2), FNODE_dir = cpu_to_le16(256)};	/* fnode->flags masks, kept in little-endian form so they can be applied to the __le16 field directly: bit 1 -> ea_secno is an anode, bit 8 -> directory */
  struct fnode
  {
-  u32 magic;                           /* f7e4 0aae */
-  u32 zero1[2];                                /* read history */
+  __le32 magic;                                /* f7e4 0aae */
+  __le32 zero1[2];                     /* read history */
    u8 len, name[15];                    /* true length, truncated name */
-  fnode_secno up;                      /* pointer to file's directory fnode */
-  secno acl_size_l;
-  secno acl_secno;
-  u16 acl_size_s;
+  __le32 up;                           /* pointer to file's directory fnode */
+  __le32 acl_size_l;
+  __le32 acl_secno;
+  __le16 acl_size_s;
    u8 acl_anode;
    u8 zero2;                            /* history bit count */
-  u32 ea_size_l;                       /* length of disk-resident ea's */
-  secno ea_secno;                      /* first sector of disk-resident ea's*/
-  u16 ea_size_s;                       /* length of fnode-resident ea's */
-
-#ifdef __LITTLE_ENDIAN
-  u8 flag0: 1;
-  u8 ea_anode: 1;                      /* 1 -> ea_secno is an anode */
-  u8 flag234567: 6;
-#else
-  u8 flag234567: 6;
-  u8 ea_anode: 1;                      /* 1 -> ea_secno is an anode */
-  u8 flag0: 1;
-#endif
+  __le32 ea_size_l;                    /* length of disk-resident ea's */
+  __le32 ea_secno;                     /* first sector of disk-resident ea's*/
+  __le16 ea_size_s;                    /* length of fnode-resident ea's */
  
-#ifdef __LITTLE_ENDIAN
-  u8 dirflag: 1;                       /* 1 -> directory.  first & only extent
-                                          points to dnode. */
-  u8 flag9012345: 7;
-#else
-  u8 flag9012345: 7;
-  u8 dirflag: 1;                       /* 1 -> directory.  first & only extent
+  __le16 flags;                                /* bit 1 set -> ea_secno is an anode */
+                                       /* bit 8 set -> directory.  first & only extent
                                            points to dnode. */
-#endif
-
    struct bplus_header btree;           /* b+ tree, 8 extents or 12 subtrees */
    union {
      struct bplus_leaf_node external[8];
      struct bplus_internal_node internal[12];
    } u;
  
-  u32 file_size;                       /* file length, bytes */
-  u32 n_needea;                                /* number of EA's with NEEDEA set */
+  __le32 file_size;                    /* file length, bytes */
+  __le32 n_needea;                     /* number of EA's with NEEDEA set */
    u8 user_id[16];                      /* unused */
-  u16 ea_offs;                         /* offset from start of fnode
+  __le16 ea_offs;                      /* offset from start of fnode
                                            to first fnode-resident ea */
    u8 dasd_limit_treshhold;
    u8 dasd_limit_delta;
-  u32 dasd_limit;
-  u32 dasd_usage;
+  __le32 dasd_limit;
+  __le32 dasd_usage;
    u8 ea[316];                          /* zero or more EA's, packed together
                                            with no alignment padding.
                                            (Do not use this name, get here
                                            via fnode + ea_offs. I think.) */
  };
  
+/*
+ * Report whether the fnode's disk-resident EAs are reached through an anode:
+ * true when FNODE_anode (bit 1 of ->flags) is set, meaning ea_secno is an
+ * anode rather than a plain sector run.
+ */
+static inline bool fnode_in_anode(struct fnode *p)
+{
+       if (p->flags & FNODE_anode)
+               return true;
+       return false;
+}
+
+/*
+ * Report whether the fnode belongs to a directory: true when FNODE_dir
+ * (bit 8 of ->flags) is set.
+ */
+static inline bool fnode_is_dir(struct fnode *p)
+{
+       if (p->flags & FNODE_dir)
+               return true;
+       return false;
+}
+
  
  /* anode: 99.44% pure allocation tree */
  
@@ -499,9 +489,9 @@ struct fnode
  
  struct anode
  {
-  u32 magic;                           /* 37e4 0aae */
-  anode_secno self;                    /* pointer to this anode */
-  secno up;                            /* parent anode or fnode */
+  __le32 magic;                                /* 37e4 0aae */
+  __le32 self;                         /* pointer to this anode */
+  __le32 up;                           /* parent anode or fnode */
  
    struct bplus_header btree;           /* b+tree, 40 extents or 60 subtrees */
    union {
@@ -509,7 +499,7 @@ struct anode
      struct bplus_internal_node internal[60];
    } u;
  
-  u32 fill[3];                         /* unused */
+  __le32 fill[3];                      /* unused */
  };
  
  
@@ -528,32 +518,23 @@ struct anode
     run, or in multiple runs.  Flags in the fnode tell whether the EA list
     is immediate, in a single run, or in multiple runs. */
  
+enum {EA_indirect = 1, EA_anode = 2, EA_needea = 128 };	/* extended_attribute.flags masks: bit 0 -> value gives sector number where real value starts, bit 1 -> that sector is an anode, bit 7 -> required ea */
  struct extended_attribute
  {
-#ifdef __LITTLE_ENDIAN
-  u8 indirect: 1;                      /* 1 -> value gives sector number
+  u8 flags;                            /* bit 0 set -> value gives sector number
                                            where real value starts */
-  u8 anode: 1;                         /* 1 -> sector is an anode
+                                       /* bit 1 set -> sector is an anode
                                            that points to fragmented value */
-  u8 flag23456: 5;
-  u8 needea: 1;                                /* required ea */
-#else
-  u8 needea: 1;                                /* required ea */
-  u8 flag23456: 5;
-  u8 anode: 1;                         /* 1 -> sector is an anode
-                                          that points to fragmented value */
-  u8 indirect: 1;                      /* 1 -> value gives sector number
-                                          where real value starts */
-#endif
+                                       /* bit 7 set -> required ea */
    u8 namelen;                          /* length of name, bytes */
    u8 valuelen_lo;                      /* length of value, bytes */
    u8 valuelen_hi;                      /* length of value, bytes */
-  u8 name[0];
+  u8 name[];
    /*
      u8 name[namelen];                  ascii attrib name
      u8 nul;                            terminating '\0', not counted
      u8 value[valuelen];                        value, arbitrary
-      if this.indirect, valuelen is 8 and the value is
+      if this.flags & 1, valuelen is 8 and the value is
          u32 length;                    real length of value, bytes
          secno secno;                   sector address where it starts
        if this.anode, the above sector number is the root of an anode tree
@@ -561,6 +542,16 @@ struct extended_attribute
    */
  };
  
+static inline bool ea_indirect(struct extended_attribute *ea)
+{
+       return ea->flags & EA_indirect;
+}
+
+static inline bool ea_in_anode(struct extended_attribute *ea)
+{
+       return ea->flags & EA_anode;
+}
+
  /*
     Local Variables:
     comment-column: 40
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h

index 6d2d5008fa435f22dcb876828e8b23984d3044b5..c07ef1f1ced60a0cf295772a218575d9c78e58d1 100644 (file)
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -75,7 +75,7 @@ struct hpfs_sb_info {
         unsigned char *sb_cp_table;     /* code page tables: */
                                         /*      128 bytes uppercasing table & */
                                         /*      128 bytes lowercasing table */
-       unsigned *sb_bmp_dir;           /* main bitmap directory */
+       __le32 *sb_bmp_dir;             /* main bitmap directory */
         unsigned sb_c_bitmap;           /* current bitmap */
         unsigned sb_max_fwd_alloc;      /* max forwad allocation */
         int sb_timeshift;
@@ -93,7 +93,7 @@ struct quad_buffer_head {
  static inline dnode_secno de_down_pointer (struct hpfs_dirent *de)
  {
    CHKCOND(de->down,("HPFS: de_down_pointer: !de->down\n"));
-  return le32_to_cpu(*(dnode_secno *) ((void *) de + le16_to_cpu(de->length) - 4));
+  return le32_to_cpu(*(__le32 *) ((void *) de + le16_to_cpu(de->length) - 4));
  }
  
  /* The first dir entry in a dnode */
@@ -141,12 +141,12 @@ static inline struct extended_attribute *next_ea(struct extended_attribute *ea)
  
  static inline secno ea_sec(struct extended_attribute *ea)
  {
-       return le32_to_cpu(get_unaligned((secno *)((char *)ea + 9 + ea->namelen)));
+       return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 9 + ea->namelen)));
  }
  
  static inline secno ea_len(struct extended_attribute *ea)
  {
-       return le32_to_cpu(get_unaligned((secno *)((char *)ea + 5 + ea->namelen)));
+       return le32_to_cpu(get_unaligned((__le32 *)((char *)ea + 5 + ea->namelen)));
  }
  
  static inline char *ea_data(struct extended_attribute *ea)
@@ -171,7 +171,7 @@ static inline void copy_de(struct hpfs_dirent *dst, struct hpfs_dirent *src)
         dst->not_8x3 = n;
  }
  
-static inline unsigned tstbits(u32 *bmp, unsigned b, unsigned n)
+static inline unsigned tstbits(__le32 *bmp, unsigned b, unsigned n)
  {
         int i;
         if ((b >= 0x4000) || (b + n - 1 >= 0x4000)) return n;
@@ -268,10 +268,10 @@ void hpfs_evict_inode(struct inode *);
  
  /* map.c */
  
-unsigned *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
-unsigned *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
+__le32 *hpfs_map_dnode_bitmap(struct super_block *, struct quad_buffer_head *);
+__le32 *hpfs_map_bitmap(struct super_block *, unsigned, struct quad_buffer_head *, char *);
  unsigned char *hpfs_load_code_page(struct super_block *, secno);
-secno *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
+__le32 *hpfs_load_bitmap_directory(struct super_block *, secno bmp);
  struct fnode *hpfs_map_fnode(struct super_block *s, ino_t, struct buffer_head **);
  struct anode *hpfs_map_anode(struct super_block *s, anode_secno, struct buffer_head **);
  struct dnode *hpfs_map_dnode(struct super_block *s, dnode_secno, struct quad_buffer_head *);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c

index b43066cbdc6a7cd201538cdc86e28fd9d22deac8..ed671e0ea78443b35bb6d1dd3eabc64bd7559c1f 100644 (file)
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -110,7 +110,7 @@ void hpfs_read_inode(struct inode *i)
                         }
                 }
         }
-       if (fnode->dirflag) {
+       if (fnode_is_dir(fnode)) {
                 int n_dnodes, n_subdirs;
                 i->i_mode |= S_IFDIR;
                 i->i_op = &hpfs_dir_iops;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c

index a790821366a7f045d068fe47df517dc479b0ecce..4acb19d78359d4bec83f90b854680dc3962905cf 100644 (file)
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -8,12 +8,12 @@
  
  #include "hpfs_fn.h"
  
-unsigned *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
+__le32 *hpfs_map_dnode_bitmap(struct super_block *s, struct quad_buffer_head *qbh)
  {
         return hpfs_map_4sectors(s, hpfs_sb(s)->sb_dmap, qbh, 0);
  }
  
-unsigned int *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
+__le32 *hpfs_map_bitmap(struct super_block *s, unsigned bmp_block,
                          struct quad_buffer_head *qbh, char *id)
  {
         secno sec;
@@ -89,18 +89,18 @@ unsigned char *hpfs_load_code_page(struct super_block *s, secno cps)
         return cp_table;
  }
  
-secno *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
+__le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
  {
         struct buffer_head *bh;
         int n = (hpfs_sb(s)->sb_fs_size + 0x200000 - 1) >> 21;
         int i;
-       secno *b;
+       __le32 *b;
         if (!(b = kmalloc(n * 512, GFP_KERNEL))) {
                 printk("HPFS: can't allocate memory for bitmap directory\n");
                 return NULL;
         }       
         for (i=0;i<n;i++) {
-               secno *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
+               __le32 *d = hpfs_map_sector(s, bmp+i, &bh, n - i - 1);
                 if (!d) {
                         kfree(b);
                         return NULL;
@@ -130,16 +130,16 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea
                                         (unsigned long)ino);
                                 goto bail;
                         }
-                       if (!fnode->dirflag) {
+                       if (!fnode_is_dir(fnode)) {
                                 if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes !=
-                                   (fnode->btree.internal ? 12 : 8)) {
+                                   (bp_internal(&fnode->btree) ? 12 : 8)) {
                                         hpfs_error(s,
                                            "bad number of nodes in fnode %08lx",
                                             (unsigned long)ino);
                                         goto bail;
                                 }
                                 if (le16_to_cpu(fnode->btree.first_free) !=
-                                   8 + fnode->btree.n_used_nodes * (fnode->btree.internal ? 8 : 12)) {
+                                   8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) {
                                         hpfs_error(s,
                                             "bad first_free pointer in fnode %08lx",
                                             (unsigned long)ino);
@@ -187,12 +187,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff
                                 goto bail;
                         }
                         if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes !=
-                           (anode->btree.internal ? 60 : 40)) {
+                           (bp_internal(&anode->btree) ? 60 : 40)) {
                                 hpfs_error(s, "bad number of nodes in anode %08x", ano);
                                 goto bail;
                         }
                         if (le16_to_cpu(anode->btree.first_free) !=
-                           8 + anode->btree.n_used_nodes * (anode->btree.internal ? 8 : 12)) {
+                           8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) {
                                 hpfs_error(s, "bad first_free pointer in anode %08x", ano);
                                 goto bail;
                         }
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c

index 30dd7b10b507a077877d58a2bb4d5ada18ee3101..9083ef8af58c162f7fd207f7ef37263b1f35de4f 100644 (file)
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -70,7 +70,7 @@ static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
         fnode->len = len;
         memcpy(fnode->name, name, len > 15 ? 15 : len);
         fnode->up = cpu_to_le32(dir->i_ino);
-       fnode->dirflag = 1;
+       fnode->flags |= FNODE_dir;
         fnode->btree.n_free_nodes = 7;
         fnode->btree.n_used_nodes = 1;
         fnode->btree.first_free = cpu_to_le16(0x14);
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c

index 54f6eccb79d9ed8c67f7ada5a96867ad4c61b37c..706a12c083ea726a7a268d647ae266b02a3a2ca7 100644 (file)
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -572,7 +572,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                 mark_buffer_dirty(bh2);
         }
  
-       if (le32_to_cpu(spareblock->hotfixes_used) || le32_to_cpu(spareblock->n_spares_used)) {
+       if (spareblock->hotfixes_used || spareblock->n_spares_used) {
                 if (errs >= 2) {
                         printk("HPFS: Hotfixes not supported here, try chkdsk\n");
                         mark_dirty(s, 0);
@@ -645,7 +645,7 @@ static int hpfs_fill_super(struct super_block *s, void *options, int silent)
                 root->i_mtime.tv_nsec = 0;
                 root->i_ctime.tv_sec = local_to_gmt(s, le32_to_cpu(de->creation_date));
                 root->i_ctime.tv_nsec = 0;
-               hpfs_i(root)->i_ea_size = le16_to_cpu(de->ea_size);
+               hpfs_i(root)->i_ea_size = le32_to_cpu(de->ea_size);
                 hpfs_i(root)->i_parent_dir = root->i_ino;
                 if (root->i_size == -1)
                         root->i_size = 2048;
diff --git a/fs/inode.c b/fs/inode.c

index c474c1d7062bcdf0aa32d86033f4f98110fb662e..c99163b1b31036ef68974c0c5dbc192f8f73f4da 100644 (file)
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1487,10 +1487,30 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
         return 0;
  }
  
+/*
+ * This does the actual work of updating an inode's time or version.  The
+ * caller must have called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+       if (inode->i_op->update_time)
+               return inode->i_op->update_time(inode, time, flags);
+
+       if (flags & S_ATIME)
+               inode->i_atime = *time;
+       if (flags & S_VERSION)
+               inode_inc_iversion(inode);
+       if (flags & S_CTIME)
+               inode->i_ctime = *time;
+       if (flags & S_MTIME)
+               inode->i_mtime = *time;
+       mark_inode_dirty_sync(inode);
+       return 0;
+}
+
  /**
   *     touch_atime     -       update the access time
- *     @mnt: mount the inode is accessed on
- *     @dentry: dentry accessed
+ *     @path: the &struct path to update
   *
   *     Update the accessed time on an inode and mark it for writeback.
   *     This function automatically handles read only file systems and media,
@@ -1525,12 +1545,83 @@ void touch_atime(struct path *path)
         if (mnt_want_write(mnt))
                 return;
  
-       inode->i_atime = now;
-       mark_inode_dirty_sync(inode);
+       /*
+        * File systems can error out when updating inodes if they need to
+        * allocate new space to modify an inode (such is the case for
+        * Btrfs), but since we touch atime while walking down the path we
+        * really don't care if we failed to update the atime of the file,
+        * so just ignore the return value.
+        */
+       update_time(inode, &now, S_ATIME);
         mnt_drop_write(mnt);
  }
  EXPORT_SYMBOL(touch_atime);
  
+/*
+ * The logic we want is
+ *
+ *     if suid or (sgid and xgrp)
+ *             remove privs
+ */
+int should_remove_suid(struct dentry *dentry)
+{
+       umode_t mode = dentry->d_inode->i_mode;
+       int kill = 0;
+
+       /* suid always must be killed */
+       if (unlikely(mode & S_ISUID))
+               kill = ATTR_KILL_SUID;
+
+       /*
+        * sgid without any exec bits is just a mandatory locking mark; leave
+        * it alone.  If some exec bits are set, it's a real sgid; kill it.
+        */
+       if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
+               kill |= ATTR_KILL_SGID;
+
+       if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
+               return kill;
+
+       return 0;
+}
+EXPORT_SYMBOL(should_remove_suid);
+
+static int __remove_suid(struct dentry *dentry, int kill)
+{
+       struct iattr newattrs;
+
+       newattrs.ia_valid = ATTR_FORCE | kill;
+       return notify_change(dentry, &newattrs);
+}
+
+int file_remove_suid(struct file *file)
+{
+       struct dentry *dentry = file->f_path.dentry;
+       struct inode *inode = dentry->d_inode;
+       int killsuid;
+       int killpriv;
+       int error = 0;
+
+       /* Fast path for nothing security related */
+       if (IS_NOSEC(inode))
+               return 0;
+
+       killsuid = should_remove_suid(dentry);
+       killpriv = security_inode_need_killpriv(dentry);
+
+       if (killpriv < 0)
+               return killpriv;
+       if (killpriv)
+               error = security_inode_killpriv(dentry);
+       if (!error && killsuid)
+               error = __remove_suid(dentry, killsuid);
+       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
+               inode->i_flags |= S_NOSEC;
+
+       return error;
+}
+EXPORT_SYMBOL(file_remove_suid);
+
  /**
   *     file_update_time        -       update mtime and ctime time
   *     @file: file accessed
@@ -1540,18 +1631,20 @@ EXPORT_SYMBOL(touch_atime);
   *     usage in the file write path of filesystems, and filesystems may
   *     choose to explicitly ignore update via this function with the
   *     S_NOCMTIME inode flag, e.g. for network filesystem where these
- *     timestamps are handled by the server.
+ *     timestamps are handled by the server.  This can return an error for
+ *     file systems that need to allocate space in order to update an inode.
   */
  
-void file_update_time(struct file *file)
+int file_update_time(struct file *file)
  {
         struct inode *inode = file->f_path.dentry->d_inode;
         struct timespec now;
-       enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+       int sync_it = 0;
+       int ret;
  
         /* First try to exhaust all avenues to not sync */
         if (IS_NOCMTIME(inode))
-               return;
+               return 0;
  
         now = current_fs_time(inode->i_sb);
         if (!timespec_equal(&inode->i_mtime, &now))
@@ -1564,21 +1657,16 @@ void file_update_time(struct file *file)
                 sync_it |= S_VERSION;
  
         if (!sync_it)
-               return;
+               return 0;
  
         /* Finally allowed to write? Takes lock. */
         if (mnt_want_write_file(file))
-               return;
+               return 0;
  
-       /* Only change inode inside the lock region */
-       if (sync_it & S_VERSION)
-               inode_inc_iversion(inode);
-       if (sync_it & S_CTIME)
-               inode->i_ctime = now;
-       if (sync_it & S_MTIME)
-               inode->i_mtime = now;
-       mark_inode_dirty_sync(inode);
+       ret = update_time(inode, &now, sync_it);
         mnt_drop_write_file(file);
+
+       return ret;
  }
  EXPORT_SYMBOL(file_update_time);
  
diff --git a/fs/internal.h b/fs/internal.h

index 9962c59ba280b1c75d78adc55b8491733075a5e0..18bc216ea09d95ecff126ef96987ff786b5cbcb1 100644 (file)
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -56,7 +56,7 @@ extern int sb_prepare_remount_readonly(struct super_block *);
  
  extern void __init mnt_init(void);
  
-DECLARE_BRLOCK(vfsmount_lock);
+extern struct lglock vfsmount_lock;
  
  
  /*
@@ -100,6 +100,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
  
  extern long do_handle_open(int mountdirfd,
                            struct file_handle __user *ufh, int open_flag);
+extern int open_check_o_direct(struct file *f);
  
  /*
   * inode.c
diff --git a/fs/isofs/export.c b/fs/isofs/export.c

index dd4687ff30d09900a14f113aec870007cfcfb7f0..aa4356d09eeeb03167bcf506a7fe8ad98efaba39 100644 (file)
--- a/fs/isofs/export.c
+++ b/fs/isofs/export.c
@@ -107,12 +107,11 @@ static struct dentry *isofs_export_get_parent(struct dentry *child)
  }
  
  static int
-isofs_export_encode_fh(struct dentry *dentry,
+isofs_export_encode_fh(struct inode *inode,
                        __u32 *fh32,
                        int *max_len,
-                      int connectable)
+                      struct inode *parent)
  {
-       struct inode * inode = dentry->d_inode;
         struct iso_inode_info * ei = ISOFS_I(inode);
         int len = *max_len;
         int type = 1;
@@ -124,7 +123,7 @@ isofs_export_encode_fh(struct dentry *dentry,
          * offset of the inode and the upper 16 bits of fh32[1] to
          * hold the offset of the parent.
          */
-       if (connectable && (len < 5)) {
+       if (parent && (len < 5)) {
                 *max_len = 5;
                 return 255;
         } else if (len < 3) {
@@ -136,16 +135,12 @@ isofs_export_encode_fh(struct dentry *dentry,
         fh32[0] = ei->i_iget5_block;
         fh16[2] = (__u16)ei->i_iget5_offset;  /* fh16 [sic] */
         fh32[2] = inode->i_generation;
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
+       if (parent) {
                 struct iso_inode_info *eparent;
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
                 eparent = ISOFS_I(parent);
                 fh32[3] = eparent->i_iget5_block;
                 fh16[3] = (__u16)eparent->i_iget5_offset;  /* fh16 [sic] */
                 fh32[4] = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
                 len = 5;
                 type = 2;
         }
diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig

index f32f346f4b0a521a5b6bbaedc7b0a5a7750c4b1e..69a48c2944da682c8a133fe75183c086ef08813b 100644 (file)
--- a/fs/jbd2/Kconfig
+++ b/fs/jbd2/Kconfig
@@ -1,6 +1,8 @@
  config JBD2
         tristate
         select CRC32
+       select CRYPTO
+       select CRYPTO_CRC32C
         help
           This is a generic journaling layer for block devices that support
           both 32-bit and 64-bit block numbers.  It is currently used by
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c

index 840f70f507924a0ac4db70a9d729f715783b49be..216f4299f65e7e2f1e26859c8e1247cdf71c55df 100644 (file)
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -85,6 +85,24 @@ nope:
         __brelse(bh);
  }
  
+static void jbd2_commit_block_csum_set(journal_t *j,
+                                      struct journal_head *descriptor)
+{
+       struct commit_header *h;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       h = (struct commit_header *)(jh2bh(descriptor)->b_data);
+       h->h_chksum_type = 0;
+       h->h_chksum_size = 0;
+       h->h_chksum[0] = 0;
+       csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+                          j->j_blocksize);
+       h->h_chksum[0] = cpu_to_be32(csum);
+}
+
  /*
   * Done it all: now submit the commit record.  We should have
   * cleaned up our previous buffers by now, so if we are in abort
@@ -128,6 +146,7 @@ static int journal_submit_commit_record(journal_t *journal,
                 tmp->h_chksum_size      = JBD2_CRC32_CHKSUM_SIZE;
                 tmp->h_chksum[0]        = cpu_to_be32(crc32_sum);
         }
+       jbd2_commit_block_csum_set(journal, descriptor);
  
         JBUFFER_TRACE(descriptor, "submit commit block");
         lock_buffer(bh);
@@ -301,6 +320,44 @@ static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
                 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
  }
  
+static void jbd2_descr_block_csum_set(journal_t *j,
+                                     struct journal_head *descriptor)
+{
+       struct jbd2_journal_block_tail *tail;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       tail = (struct jbd2_journal_block_tail *)
+                       (jh2bh(descriptor)->b_data + j->j_blocksize -
+                       sizeof(struct jbd2_journal_block_tail));
+       tail->t_checksum = 0;
+       csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+                          j->j_blocksize);
+       tail->t_checksum = cpu_to_be32(csum);
+}
+
+static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
+                                   struct buffer_head *bh, __u32 sequence)
+{
+       struct page *page = bh->b_page;
+       __u8 *addr;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       sequence = cpu_to_be32(sequence);
+       addr = kmap_atomic(page, KM_USER0);
+       csum = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+                         sizeof(sequence));
+       csum = jbd2_chksum(j, csum, addr + offset_in_page(bh->b_data),
+                         bh->b_size);
+       kunmap_atomic(addr, KM_USER0);
+
+       tag->t_checksum = cpu_to_be32(csum);
+}
  /*
   * jbd2_journal_commit_transaction
   *
@@ -334,6 +391,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
         unsigned long first_block;
         tid_t first_tid;
         int update_tail;
+       int csum_size = 0;
+
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               csum_size = sizeof(struct jbd2_journal_block_tail);
  
         /*
          * First job: lock down the current transaction and wait for
@@ -627,7 +688,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
  
                 tag = (journal_block_tag_t *) tagp;
                 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
-               tag->t_flags = cpu_to_be32(tag_flag);
+               tag->t_flags = cpu_to_be16(tag_flag);
+               jbd2_block_tag_csum_set(journal, tag, jh2bh(new_jh),
+                                       commit_transaction->t_tid);
                 tagp += tag_bytes;
                 space_left -= tag_bytes;
  
@@ -643,7 +706,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
  
                 if (bufs == journal->j_wbufsize ||
                     commit_transaction->t_buffers == NULL ||
-                   space_left < tag_bytes + 16) {
+                   space_left < tag_bytes + 16 + csum_size) {
  
                         jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);
  
@@ -651,8 +714,9 @@ void jbd2_journal_commit_transaction(journal_t *journal)
                             submitting the IOs.  "tag" still points to
                             the last tag we set up. */
  
-                       tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
+                       tag->t_flags |= cpu_to_be16(JBD2_FLAG_LAST_TAG);
  
+                       jbd2_descr_block_csum_set(journal, descriptor);
  start_journal_io:
                         for (i = 0; i < bufs; i++) {
                                 struct buffer_head *bh = wbuf[i];
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c

index 1afb701622b0b17748b4cd7f7d171df139bcc9cc..e9a3c4c85594e30aca1ed1f14d5667ba0595160a 100644 (file)
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -97,6 +97,43 @@ EXPORT_SYMBOL(jbd2_inode_cache);
  static void __journal_abort_soft (journal_t *journal, int errno);
  static int jbd2_journal_create_slab(size_t slab_size);
  
+/* Checksumming functions */
+int jbd2_verify_csum_type(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       return sb->s_checksum_type == JBD2_CRC32C_CHKSUM;
+}
+
+static __u32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+{
+       __u32 csum, old_csum;
+
+       old_csum = sb->s_checksum;
+       sb->s_checksum = 0;
+       csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+       sb->s_checksum = old_csum;
+
+       return cpu_to_be32(csum);
+}
+
+int jbd2_superblock_csum_verify(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       return sb->s_checksum == jbd2_superblock_csum(j, sb);
+}
+
+void jbd2_superblock_csum_set(journal_t *j, journal_superblock_t *sb)
+{
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       sb->s_checksum = jbd2_superblock_csum(j, sb);
+}
+
  /*
   * Helper function used to manage commit timeouts
   */
@@ -1348,6 +1385,7 @@ static void jbd2_journal_update_sb_errno(journal_t *journal)
         jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
                   journal->j_errno);
         sb->s_errno    = cpu_to_be32(journal->j_errno);
+       jbd2_superblock_csum_set(journal, sb);
         read_unlock(&journal->j_state_lock);
  
         jbd2_write_superblock(journal, WRITE_SYNC);
@@ -1376,6 +1414,9 @@ static int journal_get_superblock(journal_t *journal)
                 }
         }
  
+       if (buffer_verified(bh))
+               return 0;
+
         sb = journal->j_superblock;
  
         err = -EINVAL;
@@ -1413,6 +1454,43 @@ static int journal_get_superblock(journal_t *journal)
                 goto out;
         }
  
+       if (JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM) &&
+           JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               /* Can't have checksum v1 and v2 on at the same time! */
+               printk(KERN_ERR "JBD: Can't enable checksumming v1 and v2 "
+                      "at the same time!\n");
+               goto out;
+       }
+
+       if (!jbd2_verify_csum_type(journal, sb)) {
+               printk(KERN_ERR "JBD: Unknown checksum type\n");
+               goto out;
+       }
+
+       /* Load the checksum driver */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               journal->j_chksum_driver = crypto_alloc_shash("crc32c", 0, 0);
+               if (IS_ERR(journal->j_chksum_driver)) {
+                       printk(KERN_ERR "JBD: Cannot load crc32c driver.\n");
+                       err = PTR_ERR(journal->j_chksum_driver);
+                       journal->j_chksum_driver = NULL;
+                       goto out;
+               }
+       }
+
+       /* Check superblock checksum */
+       if (!jbd2_superblock_csum_verify(journal, sb)) {
+               printk(KERN_ERR "JBD: journal checksum error\n");
+               goto out;
+       }
+
+       /* Precompute checksum seed for all metadata */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+                                                  sizeof(sb->s_uuid));
+
+       set_buffer_verified(bh);
+
         return 0;
  
  out:
@@ -1564,6 +1642,8 @@ int jbd2_journal_destroy(journal_t *journal)
                 iput(journal->j_inode);
         if (journal->j_revoke)
                 jbd2_journal_destroy_revoke(journal);
+       if (journal->j_chksum_driver)
+               crypto_free_shash(journal->j_chksum_driver);
         kfree(journal->j_wbuf);
         kfree(journal);
  
@@ -1653,6 +1733,10 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
  int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
                           unsigned long ro, unsigned long incompat)
  {
+#define INCOMPAT_FEATURE_ON(f) \
+               ((incompat & (f)) && !(sb->s_feature_incompat & cpu_to_be32(f)))
+#define COMPAT_FEATURE_ON(f) \
+               ((compat & (f)) && !(sb->s_feature_compat & cpu_to_be32(f)))
         journal_superblock_t *sb;
  
         if (jbd2_journal_check_used_features(journal, compat, ro, incompat))
@@ -1661,16 +1745,54 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
         if (!jbd2_journal_check_available_features(journal, compat, ro, incompat))
                 return 0;
  
+       /* Asking for checksumming v2 and v1?  Only give them v2. */
+       if (incompat & JBD2_FEATURE_INCOMPAT_CSUM_V2 &&
+           compat & JBD2_FEATURE_COMPAT_CHECKSUM)
+               compat &= ~JBD2_FEATURE_COMPAT_CHECKSUM;
+
         jbd_debug(1, "Setting new features 0x%lx/0x%lx/0x%lx\n",
                   compat, ro, incompat);
  
         sb = journal->j_superblock;
  
+       /* If enabling v2 checksums, update superblock */
+       if (INCOMPAT_FEATURE_ON(JBD2_FEATURE_INCOMPAT_CSUM_V2)) {
+               sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
+               sb->s_feature_compat &=
+                       ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
+
+               /* Load the checksum driver */
+               if (journal->j_chksum_driver == NULL) {
+                       journal->j_chksum_driver = crypto_alloc_shash("crc32c",
+                                                                     0, 0);
+                       if (IS_ERR(journal->j_chksum_driver)) {
+                               printk(KERN_ERR "JBD: Cannot load crc32c "
+                                      "driver.\n");
+                               journal->j_chksum_driver = NULL;
+                               return 0;
+                       }
+               }
+
+               /* Precompute checksum seed for all metadata */
+               if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                             JBD2_FEATURE_INCOMPAT_CSUM_V2))
+                       journal->j_csum_seed = jbd2_chksum(journal, ~0,
+                                                          sb->s_uuid,
+                                                          sizeof(sb->s_uuid));
+       }
+
+       /* If enabling v1 checksums, downgrade superblock */
+       if (COMPAT_FEATURE_ON(JBD2_FEATURE_COMPAT_CHECKSUM))
+               sb->s_feature_incompat &=
+                       ~cpu_to_be32(JBD2_FEATURE_INCOMPAT_CSUM_V2);
+
         sb->s_feature_compat    |= cpu_to_be32(compat);
         sb->s_feature_ro_compat |= cpu_to_be32(ro);
         sb->s_feature_incompat  |= cpu_to_be32(incompat);
  
         return 1;
+#undef COMPAT_FEATURE_ON
+#undef INCOMPAT_FEATURE_ON
  }
  
  /*
@@ -1975,10 +2097,16 @@ int jbd2_journal_blocks_per_page(struct inode *inode)
   */
  size_t journal_tag_bytes(journal_t *journal)
  {
+       journal_block_tag_t tag;
+       size_t x = 0;
+
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               x += sizeof(tag.t_checksum);
+
         if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
-               return JBD2_TAG_SIZE64;
+               return x + JBD2_TAG_SIZE64;
         else
-               return JBD2_TAG_SIZE32;
+               return x + JBD2_TAG_SIZE32;
  }
  
  /*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c

index c1a03354a22ff1b5a787251b422afcb5225ca2c9..0131e4362534c4d5b83273130ee292463ec49f07 100644 (file)
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -174,6 +174,25 @@ static int jread(struct buffer_head **bhp, journal_t *journal,
         return 0;
  }
  
+static int jbd2_descr_block_csum_verify(journal_t *j,
+                                       void *buf)
+{
+       struct jbd2_journal_block_tail *tail;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       tail = (struct jbd2_journal_block_tail *)(buf + j->j_blocksize -
+                       sizeof(struct jbd2_journal_block_tail));
+       provided = tail->t_checksum;
+       tail->t_checksum = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       tail->t_checksum = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
  
  /*
   * Count the number of in-use tags in a journal descriptor block.
@@ -186,6 +205,9 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
         int                     nr = 0, size = journal->j_blocksize;
         int                     tag_bytes = journal_tag_bytes(journal);
  
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               size -= sizeof(struct jbd2_journal_block_tail);
+
         tagp = &bh->b_data[sizeof(journal_header_t)];
  
         while ((tagp - bh->b_data + tag_bytes) <= size) {
@@ -193,10 +215,10 @@ static int count_tags(journal_t *journal, struct buffer_head *bh)
  
                 nr++;
                 tagp += tag_bytes;
-               if (!(tag->t_flags & cpu_to_be32(JBD2_FLAG_SAME_UUID)))
+               if (!(tag->t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
                         tagp += 16;
  
-               if (tag->t_flags & cpu_to_be32(JBD2_FLAG_LAST_TAG))
+               if (tag->t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
                         break;
         }
  
@@ -353,6 +375,41 @@ static int calc_chksums(journal_t *journal, struct buffer_head *bh,
         return 0;
  }
  
+static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
+{
+       struct commit_header *h;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       h = buf;
+       provided = h->h_chksum[0];
+       h->h_chksum[0] = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       h->h_chksum[0] = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
+
+static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
+                                     void *buf, __u32 sequence)
+{
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       sequence = cpu_to_be32(sequence);
+       calculated = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&sequence,
+                                sizeof(sequence));
+       calculated = jbd2_chksum(j, calculated, buf, j->j_blocksize);
+       provided = be32_to_cpu(tag->t_checksum);
+
+       return provided == cpu_to_be32(calculated);
+}
+
  static int do_one_pass(journal_t *journal,
                         struct recovery_info *info, enum passtype pass)
  {
@@ -366,6 +423,7 @@ static int do_one_pass(journal_t *journal,
         int                     blocktype;
         int                     tag_bytes = journal_tag_bytes(journal);
         __u32                   crc32_sum = ~0; /* Transactional Checksums */
+       int                     descr_csum_size = 0;
  
         /*
          * First thing is to establish what we expect to find in the log
@@ -451,6 +509,18 @@ static int do_one_pass(journal_t *journal,
  
                 switch(blocktype) {
                 case JBD2_DESCRIPTOR_BLOCK:
+                       /* Verify checksum first */
+                       if (JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                       JBD2_FEATURE_INCOMPAT_CSUM_V2))
+                               descr_csum_size =
+                                       sizeof(struct jbd2_journal_block_tail);
+                       if (descr_csum_size > 0 &&
+                           !jbd2_descr_block_csum_verify(journal,
+                                                         bh->b_data)) {
+                               err = -EIO;
+                               goto failed;
+                       }
+
                         /* If it is a valid descriptor block, replay it
                          * in pass REPLAY; if journal_checksums enabled, then
                          * calculate checksums in PASS_SCAN, otherwise,
@@ -481,11 +551,11 @@ static int do_one_pass(journal_t *journal,
  
                         tagp = &bh->b_data[sizeof(journal_header_t)];
                         while ((tagp - bh->b_data + tag_bytes)
-                              <= journal->j_blocksize) {
+                              <= journal->j_blocksize - descr_csum_size) {
                                 unsigned long io_block;
  
                                 tag = (journal_block_tag_t *) tagp;
-                               flags = be32_to_cpu(tag->t_flags);
+                               flags = be16_to_cpu(tag->t_flags);
  
                                 io_block = next_log_block++;
                                 wrap(journal, next_log_block);
@@ -516,6 +586,19 @@ static int do_one_pass(journal_t *journal,
                                                 goto skip_write;
                                         }
  
+                                       /* Look for block corruption */
+                                       if (!jbd2_block_tag_csum_verify(
+                                               journal, tag, obh->b_data,
+                                               be32_to_cpu(tmp->h_sequence))) {
+                                               brelse(obh);
+                                               success = -EIO;
+                                               printk(KERN_ERR "JBD: Invalid "
+                                                      "checksum recovering "
+                                                      "block %llu in log\n",
+                                                      blocknr);
+                                               continue;
+                                       }
+
                                         /* Find a buffer for the new
                                          * data being restored */
                                         nbh = __getblk(journal->j_fs_dev,
@@ -650,6 +733,19 @@ static int do_one_pass(journal_t *journal,
                                 }
                                 crc32_sum = ~0;
                         }
+                       if (pass == PASS_SCAN &&
+                           !jbd2_commit_block_csum_verify(journal,
+                                                          bh->b_data)) {
+                               info->end_transaction = next_commit_ID;
+
+                               if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
+                                    JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
+                                       journal->j_failed_commit =
+                                               next_commit_ID;
+                                       brelse(bh);
+                                       break;
+                               }
+                       }
                         brelse(bh);
                         next_commit_ID++;
                         continue;
@@ -706,6 +802,25 @@ static int do_one_pass(journal_t *journal,
         return err;
  }
  
+static int jbd2_revoke_block_csum_verify(journal_t *j,
+                                        void *buf)
+{
+       struct jbd2_journal_revoke_tail *tail;
+       __u32 provided, calculated;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return 1;
+
+       tail = (struct jbd2_journal_revoke_tail *)(buf + j->j_blocksize -
+                       sizeof(struct jbd2_journal_revoke_tail));
+       provided = tail->r_checksum;
+       tail->r_checksum = 0;
+       calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+       tail->r_checksum = provided;
+
+       provided = be32_to_cpu(provided);
+       return provided == calculated;
+}
  
  /* Scan a revoke record, marking all blocks mentioned as revoked. */
  
@@ -720,6 +835,9 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
         offset = sizeof(jbd2_journal_revoke_header_t);
         max = be32_to_cpu(header->r_count);
  
+       if (!jbd2_revoke_block_csum_verify(journal, header))
+               return -EINVAL;
+
         if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_64BIT))
                 record_len = 8;
  
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c

index 6973705d6a3d9db1c96ed67f55c97c8a13ee2f6d..f30b80b4ce8bef98cab621bf731e13682661ca6d 100644 (file)
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -578,6 +578,7 @@ static void write_one_revoke_record(journal_t *journal,
                                     struct jbd2_revoke_record_s *record,
                                     int write_op)
  {
+       int csum_size = 0;
         struct journal_head *descriptor;
         int offset;
         journal_header_t *header;
@@ -592,9 +593,13 @@ static void write_one_revoke_record(journal_t *journal,
         descriptor = *descriptorp;
         offset = *offsetp;
  
+       /* Do we need to leave space at the end for a checksum? */
+       if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               csum_size = sizeof(struct jbd2_journal_revoke_tail);
+
         /* Make sure we have a descriptor with space left for the record */
         if (descriptor) {
-               if (offset == journal->j_blocksize) {
+               if (offset >= journal->j_blocksize - csum_size) {
                         flush_descriptor(journal, descriptor, offset, write_op);
                         descriptor = NULL;
                 }
@@ -631,6 +636,24 @@ static void write_one_revoke_record(journal_t *journal,
         *offsetp = offset;
  }
  
+static void jbd2_revoke_csum_set(journal_t *j,
+                                struct journal_head *descriptor)
+{
+       struct jbd2_journal_revoke_tail *tail;
+       __u32 csum;
+
+       if (!JBD2_HAS_INCOMPAT_FEATURE(j, JBD2_FEATURE_INCOMPAT_CSUM_V2))
+               return;
+
+       tail = (struct jbd2_journal_revoke_tail *)
+                       (jh2bh(descriptor)->b_data + j->j_blocksize -
+                       sizeof(struct jbd2_journal_revoke_tail));
+       tail->r_checksum = 0;
+       csum = jbd2_chksum(j, j->j_csum_seed, jh2bh(descriptor)->b_data,
+                          j->j_blocksize);
+       tail->r_checksum = cpu_to_be32(csum);
+}
+
  /*
   * Flush a revoke descriptor out to the journal.  If we are aborting,
   * this is a noop; otherwise we are generating a buffer which needs to
@@ -652,6 +675,8 @@ static void flush_descriptor(journal_t *journal,
  
         header = (jbd2_journal_revoke_header_t *) jh2bh(descriptor)->b_data;
         header->r_count = cpu_to_be32(offset);
+       jbd2_revoke_csum_set(journal, descriptor);
+
         set_buffer_jwrite(bh);
         BUFFER_TRACE(bh, "write");
         set_buffer_dirty(bh);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c

index ddcd3549c6c26cbc9cb9dd46831b189ed3c0441e..fb1ab9533b67277a557cd5f8ea9f7216b8284d4e 100644 (file)
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -162,8 +162,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
  
  alloc_transaction:
         if (!journal->j_running_transaction) {
-               new_transaction = kmem_cache_alloc(transaction_cache,
-                                                  gfp_mask | __GFP_ZERO);
+               new_transaction = kmem_cache_zalloc(transaction_cache,
+                                                   gfp_mask);
                 if (!new_transaction) {
                         /*
                          * If __GFP_FS is not present, then we may be
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h

index 55a0c1dceadfddcf990b8fdbcfec015fc75fab32..44dca1f041c5cbc2b057e3eb7e13840c3edd25d7 100644 (file)
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -126,6 +126,10 @@ struct jffs2_sb_info {
         struct jffs2_inodirty *wbuf_inodes;
         struct rw_semaphore wbuf_sem;   /* Protects the write buffer */
  
+       struct delayed_work wbuf_dwork; /* write-buffer write-out work */
+       int wbuf_queued;                /* non-zero if delayed work is queued */
+       spinlock_t wbuf_dwork_lock;     /* protects wbuf_dwork and wbuf_queued */
+
         unsigned char *oobbuf;
         int oobavail; /* How many bytes are available for JFFS2 in OOB */
  #endif
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h

index 1cd3aec9d9ae282dd31226d0717aaf69a55f414d..bcd983d7e7f99e7e295decc1d26092d464a14d9f 100644 (file)
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -95,6 +95,7 @@ static inline void jffs2_init_inode_info(struct jffs2_inode_info *f)
  #define jffs2_ubivol(c) (0)
  #define jffs2_ubivol_setup(c) (0)
  #define jffs2_ubivol_cleanup(c) do {} while (0)
+#define jffs2_dirty_trigger(c) do {} while (0)
  
  #else /* NAND and/or ECC'd NOR support present */
  
@@ -135,14 +136,10 @@ void jffs2_ubivol_cleanup(struct jffs2_sb_info *c);
  #define jffs2_nor_wbuf_flash(c) (c->mtd->type == MTD_NORFLASH && ! (c->mtd->flags & MTD_BIT_WRITEABLE))
  int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c);
  void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c);
+void jffs2_dirty_trigger(struct jffs2_sb_info *c);
  
  #endif /* WRITEBUFFER */
  
-static inline void jffs2_dirty_trigger(struct jffs2_sb_info *c)
-{
-       OFNI_BS_2SFFJ(c)->s_dirt = 1;
-}
-
  /* background.c */
  int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c);
  void jffs2_stop_garbage_collect_thread(struct jffs2_sb_info *c);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c

index f9916f312bd81e3590fde1c92a025458cb64ab11..bc586f204228633ed2d3bcf388991a0ec4b21ff8 100644 (file)
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -63,21 +63,6 @@ static void jffs2_i_init_once(void *foo)
         inode_init_once(&f->vfs_inode);
  }
  
-static void jffs2_write_super(struct super_block *sb)
-{
-       struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-
-       lock_super(sb);
-       sb->s_dirt = 0;
-
-       if (!(sb->s_flags & MS_RDONLY)) {
-               jffs2_dbg(1, "%s()\n", __func__);
-               jffs2_flush_wbuf_gc(c, 0);
-       }
-
-       unlock_super(sb);
-}
-
  static const char *jffs2_compr_name(unsigned int compr)
  {
         switch (compr) {
@@ -113,8 +98,6 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
  {
         struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
  
-       jffs2_write_super(sb);
-
         mutex_lock(&c->alloc_sem);
         jffs2_flush_wbuf_pad(c);
         mutex_unlock(&c->alloc_sem);
@@ -251,7 +234,6 @@ static const struct super_operations jffs2_super_operations =
         .alloc_inode =  jffs2_alloc_inode,
         .destroy_inode =jffs2_destroy_inode,
         .put_super =    jffs2_put_super,
-       .write_super =  jffs2_write_super,
         .statfs =       jffs2_statfs,
         .remount_fs =   jffs2_remount_fs,
         .evict_inode =  jffs2_evict_inode,
@@ -319,9 +301,6 @@ static void jffs2_put_super (struct super_block *sb)
  
         jffs2_dbg(2, "%s()\n", __func__);
  
-       if (sb->s_dirt)
-               jffs2_write_super(sb);
-
         mutex_lock(&c->alloc_sem);
         jffs2_flush_wbuf_pad(c);
         mutex_unlock(&c->alloc_sem);
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c

index 74d9be19df3f1fff1d7defdc7824c90240a302f6..6f4529d3697fd3f97d5b018dbe9f5c0362cee034 100644 (file)
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -20,6 +20,7 @@
  #include <linux/mtd/nand.h>
  #include <linux/jiffies.h>
  #include <linux/sched.h>
+#include <linux/writeback.h>
  
  #include "nodelist.h"
  
@@ -85,7 +86,7 @@ static void jffs2_wbuf_dirties_inode(struct jffs2_sb_info *c, uint32_t ino)
  {
         struct jffs2_inodirty *new;
  
-       /* Mark the superblock dirty so that kupdated will flush... */
+       /* Schedule delayed write-buffer write-out */
         jffs2_dirty_trigger(c);
  
         if (jffs2_wbuf_pending_for_ino(c, ino))
@@ -1148,6 +1149,47 @@ int jffs2_write_nand_badblock(struct jffs2_sb_info *c, struct jffs2_eraseblock *
         return 1;
  }
  
+static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
+{
+       struct delayed_work *dwork;
+
+       dwork = container_of(work, struct delayed_work, work);
+       return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
+}
+
+static void delayed_wbuf_sync(struct work_struct *work)
+{
+       struct jffs2_sb_info *c = work_to_sb(work);
+       struct super_block *sb = OFNI_BS_2SFFJ(c);
+
+       spin_lock(&c->wbuf_dwork_lock);
+       c->wbuf_queued = 0;
+       spin_unlock(&c->wbuf_dwork_lock);
+
+       if (!(sb->s_flags & MS_RDONLY)) {
+               jffs2_dbg(1, "%s()\n", __func__);
+               jffs2_flush_wbuf_gc(c, 0);
+       }
+}
+
+void jffs2_dirty_trigger(struct jffs2_sb_info *c)
+{
+       struct super_block *sb = OFNI_BS_2SFFJ(c);
+       unsigned long delay;
+
+       if (sb->s_flags & MS_RDONLY)
+               return;
+
+       spin_lock(&c->wbuf_dwork_lock);
+       if (!c->wbuf_queued) {
+               jffs2_dbg(1, "%s()\n", __func__);
+               delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+               queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay);
+               c->wbuf_queued = 1;
+       }
+       spin_unlock(&c->wbuf_dwork_lock);
+}
+
  int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
  {
         struct nand_ecclayout *oinfo = c->mtd->ecclayout;
@@ -1169,6 +1211,8 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c)
  
         /* Initialise write buffer */
         init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
         c->wbuf_pagesize = c->mtd->writesize;
         c->wbuf_ofs = 0xFFFFFFFF;
  
@@ -1207,8 +1251,8 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) {
  
         /* Initialize write buffer */
         init_rwsem(&c->wbuf_sem);
-
-
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
         c->wbuf_pagesize =  c->mtd->erasesize;
  
         /* Find a suitable c->sector_size
@@ -1267,6 +1311,9 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) {
  
         /* Initialize write buffer */
         init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
+
         c->wbuf_pagesize = c->mtd->writesize;
         c->wbuf_ofs = 0xFFFFFFFF;
  
@@ -1299,6 +1346,8 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) {
                 return 0;
  
         init_rwsem(&c->wbuf_sem);
+       spin_lock_init(&c->wbuf_dwork_lock);
+       INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync);
  
         c->wbuf_pagesize =  c->mtd->writesize;
         c->wbuf_ofs = 0xFFFFFFFF;
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c

index 1ead0750cdbb00d680320ffb5b5747a504500b44..80938fda67e0e6fde67999d3556820b87b6acd33 100644 (file)
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -251,39 +251,40 @@ out_err:
         return err;
  }
  
-static int lockd_up_net(struct net *net)
+static int lockd_up_net(struct svc_serv *serv, struct net *net)
  {
         struct lockd_net *ln = net_generic(net, lockd_net_id);
-       struct svc_serv *serv = nlmsvc_rqst->rq_server;
         int error;
  
-       if (ln->nlmsvc_users)
+       if (ln->nlmsvc_users++)
                 return 0;
  
-       error = svc_rpcb_setup(serv, net);
+       error = svc_bind(serv, net);
         if (error)
-               goto err_rpcb;
+               goto err_bind;
  
         error = make_socks(serv, net);
         if (error < 0)
                 goto err_socks;
+       dprintk("lockd_up_net: per-net data created; net=%p\n", net);
         return 0;
  
  err_socks:
         svc_rpcb_cleanup(serv, net);
-err_rpcb:
+err_bind:
+       ln->nlmsvc_users--;
         return error;
  }
  
-static void lockd_down_net(struct net *net)
+static void lockd_down_net(struct svc_serv *serv, struct net *net)
  {
         struct lockd_net *ln = net_generic(net, lockd_net_id);
-       struct svc_serv *serv = nlmsvc_rqst->rq_server;
  
         if (ln->nlmsvc_users) {
                 if (--ln->nlmsvc_users == 0) {
                         nlm_shutdown_hosts_net(net);
                         svc_shutdown_net(serv, net);
+                       dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net);
                 }
         } else {
                 printk(KERN_ERR "lockd_down_net: no users! task=%p, net=%p\n",
@@ -292,21 +293,60 @@ static void lockd_down_net(struct net *net)
         }
  }
  
-/*
- * Bring up the lockd process if it's not already up.
- */
-int lockd_up(struct net *net)
+static int lockd_start_svc(struct svc_serv *serv)
+{
+       int error;
+
+       if (nlmsvc_rqst)
+               return 0;
+
+       /*
+        * Create the kernel thread and wait for it to start.
+        */
+       nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
+       if (IS_ERR(nlmsvc_rqst)) {
+               error = PTR_ERR(nlmsvc_rqst);
+               printk(KERN_WARNING
+                       "lockd_up: svc_rqst allocation failed, error=%d\n",
+                       error);
+               goto out_rqst;
+       }
+
+       svc_sock_update_bufs(serv);
+       serv->sv_maxconn = nlm_max_connections;
+
+       nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
+       if (IS_ERR(nlmsvc_task)) {
+               error = PTR_ERR(nlmsvc_task);
+               printk(KERN_WARNING
+                       "lockd_up: kthread_run failed, error=%d\n", error);
+               goto out_task;
+       }
+       dprintk("lockd_up: service started\n");
+       return 0;
+
+out_task:
+       svc_exit_thread(nlmsvc_rqst);
+       nlmsvc_task = NULL;
+out_rqst:
+       nlmsvc_rqst = NULL;
+       return error;
+}
+
+static struct svc_serv *lockd_create_svc(void)
  {
         struct svc_serv *serv;
-       int             error = 0;
  
-       mutex_lock(&nlmsvc_mutex);
         /*
          * Check whether we're already up and running.
          */
         if (nlmsvc_rqst) {
-               error = lockd_up_net(net);
-               goto out;
+               /*
+                * Note: increase service usage, because later in case of error
+                * svc_destroy() will be called.
+                */
+               svc_get(nlmsvc_rqst->rq_server);
+               return nlmsvc_rqst->rq_server;
         }
  
         /*
@@ -317,59 +357,53 @@ int lockd_up(struct net *net)
                 printk(KERN_WARNING
                         "lockd_up: no pid, %d users??\n", nlmsvc_users);
  
-       error = -ENOMEM;
         serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
         if (!serv) {
                 printk(KERN_WARNING "lockd_up: create service failed\n");
-               goto out;
+               return ERR_PTR(-ENOMEM);
         }
+       dprintk("lockd_up: service created\n");
+       return serv;
+}
  
-       error = make_socks(serv, net);
-       if (error < 0)
-               goto destroy_and_out;
+/*
+ * Bring up the lockd process if it's not already up.
+ */
+int lockd_up(struct net *net)
+{
+       struct svc_serv *serv;
+       int error;
  
-       /*
-        * Create the kernel thread and wait for it to start.
-        */
-       nlmsvc_rqst = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-       if (IS_ERR(nlmsvc_rqst)) {
-               error = PTR_ERR(nlmsvc_rqst);
-               nlmsvc_rqst = NULL;
-               printk(KERN_WARNING
-                       "lockd_up: svc_rqst allocation failed, error=%d\n",
-                       error);
-               goto destroy_and_out;
+       mutex_lock(&nlmsvc_mutex);
+
+       serv = lockd_create_svc();
+       if (IS_ERR(serv)) {
+               error = PTR_ERR(serv);
+               goto err_create;
         }
  
-       svc_sock_update_bufs(serv);
-       serv->sv_maxconn = nlm_max_connections;
+       error = lockd_up_net(serv, net);
+       if (error < 0)
+               goto err_net;
  
-       nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name);
-       if (IS_ERR(nlmsvc_task)) {
-               error = PTR_ERR(nlmsvc_task);
-               svc_exit_thread(nlmsvc_rqst);
-               nlmsvc_task = NULL;
-               nlmsvc_rqst = NULL;
-               printk(KERN_WARNING
-                       "lockd_up: kthread_run failed, error=%d\n", error);
-               goto destroy_and_out;
-       }
+       error = lockd_start_svc(serv);
+       if (error < 0)
+               goto err_start;
  
+       nlmsvc_users++;
         /*
          * Note: svc_serv structures have an initial use count of 1,
          * so we exit through here on both success and failure.
          */
-destroy_and_out:
+err_net:
         svc_destroy(serv);
-out:
-       if (!error) {
-               struct lockd_net *ln = net_generic(net, lockd_net_id);
-
-               ln->nlmsvc_users++;
-               nlmsvc_users++;
-       }
+err_create:
         mutex_unlock(&nlmsvc_mutex);
         return error;
+
+err_start:
+       lockd_down_net(serv, net);
+       goto err_net;
  }
  EXPORT_SYMBOL_GPL(lockd_up);
  
@@ -380,11 +414,10 @@ void
  lockd_down(struct net *net)
  {
         mutex_lock(&nlmsvc_mutex);
+       lockd_down_net(nlmsvc_rqst->rq_server, net);
         if (nlmsvc_users) {
-               if (--nlmsvc_users) {
-                       lockd_down_net(net);
+               if (--nlmsvc_users)
                         goto out;
-               }
         } else {
                 printk(KERN_ERR "lockd_down: no users! task=%p\n",
                         nlmsvc_task);
@@ -396,7 +429,9 @@ lockd_down(struct net *net)
                 BUG();
         }
         kthread_stop(nlmsvc_task);
+       dprintk("lockd_down: service stopped\n");
         svc_exit_thread(nlmsvc_rqst);
+       dprintk("lockd_down: service destroyed\n");
         nlmsvc_task = NULL;
         nlmsvc_rqst = NULL;
  out:
diff --git a/fs/locks.c b/fs/locks.c

index 4f441e46cef47bc67b08a3e82b78f389dfbbf818..814c51d0de4739e4b89e9091e00c17f284c0e2ba 100644 (file)
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1636,12 +1636,13 @@ EXPORT_SYMBOL(flock_lock_file_wait);
  SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
  {
         struct file *filp;
+       int fput_needed;
         struct file_lock *lock;
         int can_sleep, unlock;
         int error;
  
         error = -EBADF;
-       filp = fget(fd);
+       filp = fget_light(fd, &fput_needed);
         if (!filp)
                 goto out;
  
@@ -1674,7 +1675,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd)
         locks_free_lock(lock);
  
   out_putf:
-       fput(filp);
+       fput_light(filp, fput_needed);
   out:
         return error;
  }
diff --git a/fs/namei.c b/fs/namei.c

index c651f02c9fecb930c97a2668adc075090c04b7c9..7d694194024ac4d2459e7cc3d60014bdff64e3ba 100644 (file)
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -449,7 +449,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry)
         mntget(nd->path.mnt);
  
         rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
         nd->flags &= ~LOOKUP_RCU;
         return 0;
  
@@ -507,14 +507,14 @@ static int complete_walk(struct nameidata *nd)
                 if (unlikely(!__d_rcu_to_refcount(dentry, nd->seq))) {
                         spin_unlock(&dentry->d_lock);
                         rcu_read_unlock();
-                       br_read_unlock(vfsmount_lock);
+                       br_read_unlock(&vfsmount_lock);
                         return -ECHILD;
                 }
                 BUG_ON(nd->inode != dentry->d_inode);
                 spin_unlock(&dentry->d_lock);
                 mntget(nd->path.mnt);
                 rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
         }
  
         if (likely(!(nd->flags & LOOKUP_JUMPED)))
@@ -681,15 +681,15 @@ int follow_up(struct path *path)
         struct mount *parent;
         struct dentry *mountpoint;
  
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         parent = mnt->mnt_parent;
         if (&parent->mnt == path->mnt) {
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                 return 0;
         }
         mntget(&parent->mnt);
         mountpoint = dget(mnt->mnt_mountpoint);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
         dput(path->dentry);
         path->dentry = mountpoint;
         mntput(path->mnt);
@@ -947,7 +947,7 @@ failed:
         if (!(nd->flags & LOOKUP_ROOT))
                 nd->root.mnt = NULL;
         rcu_read_unlock();
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
         return -ECHILD;
  }
  
@@ -1125,8 +1125,8 @@ static struct dentry *__lookup_hash(struct qstr *name,
   *  small and for now I'd prefer to have fast path as straight as possible.
   *  It _is_ time-critical.
   */
-static int do_lookup(struct nameidata *nd, struct qstr *name,
-                       struct path *path, struct inode **inode)
+static int lookup_fast(struct nameidata *nd, struct qstr *name,
+                      struct path *path, struct inode **inode)
  {
         struct vfsmount *mnt = nd->path.mnt;
         struct dentry *dentry, *parent = nd->path.dentry;
@@ -1208,7 +1208,7 @@ unlazy:
                         goto need_lookup;
                 }
         }
-done:
+
         path->mnt = mnt;
         path->dentry = dentry;
         err = follow_managed(path, nd->flags);
@@ -1222,6 +1222,17 @@ done:
         return 0;
  
  need_lookup:
+       return 1;
+}
+
+/* Fast lookup failed, do it the slow way */
+static int lookup_slow(struct nameidata *nd, struct qstr *name,
+                      struct path *path)
+{
+       struct dentry *dentry, *parent;
+       int err;
+
+       parent = nd->path.dentry;
         BUG_ON(nd->inode != parent->d_inode);
  
         mutex_lock(&parent->d_inode->i_mutex);
@@ -1229,7 +1240,16 @@ need_lookup:
         mutex_unlock(&parent->d_inode->i_mutex);
         if (IS_ERR(dentry))
                 return PTR_ERR(dentry);
-       goto done;
+       path->mnt = nd->path.mnt;
+       path->dentry = dentry;
+       err = follow_managed(path, nd->flags);
+       if (unlikely(err < 0)) {
+               path_put_conditional(path, nd);
+               return err;
+       }
+       if (err)
+               nd->flags |= LOOKUP_JUMPED;
+       return 0;
  }
  
  static inline int may_lookup(struct nameidata *nd)
@@ -1265,7 +1285,7 @@ static void terminate_walk(struct nameidata *nd)
                 if (!(nd->flags & LOOKUP_ROOT))
                         nd->root.mnt = NULL;
                 rcu_read_unlock();
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
         }
  }
  
@@ -1301,21 +1321,26 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
          */
         if (unlikely(type != LAST_NORM))
                 return handle_dots(nd, type);
-       err = do_lookup(nd, name, path, &inode);
+       err = lookup_fast(nd, name, path, &inode);
         if (unlikely(err)) {
-               terminate_walk(nd);
-               return err;
-       }
-       if (!inode) {
-               path_to_nameidata(path, nd);
-               terminate_walk(nd);
-               return -ENOENT;
+               if (err < 0)
+                       goto out_err;
+
+               err = lookup_slow(nd, name, path);
+               if (err < 0)
+                       goto out_err;
+
+               inode = path->dentry->d_inode;
         }
+       err = -ENOENT;
+       if (!inode)
+               goto out_path_put;
+
         if (should_follow_link(inode, follow)) {
                 if (nd->flags & LOOKUP_RCU) {
                         if (unlikely(unlazy_walk(nd, path->dentry))) {
-                               terminate_walk(nd);
-                               return -ECHILD;
+                               err = -ECHILD;
+                               goto out_err;
                         }
                 }
                 BUG_ON(inode != path->dentry->d_inode);
@@ -1324,6 +1349,12 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
         path_to_nameidata(path, nd);
         nd->inode = inode;
         return 0;
+
+out_path_put:
+       path_to_nameidata(path, nd);
+out_err:
+       terminate_walk(nd);
+       return err;
  }
  
  /*
@@ -1620,7 +1651,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                 nd->path = nd->root;
                 nd->inode = inode;
                 if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                         rcu_read_lock();
                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
                 } else {
@@ -1633,7 +1664,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
  
         if (*name=='/') {
                 if (flags & LOOKUP_RCU) {
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                         rcu_read_lock();
                         set_root_rcu(nd);
                 } else {
@@ -1646,7 +1677,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                         struct fs_struct *fs = current->fs;
                         unsigned seq;
  
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                         rcu_read_lock();
  
                         do {
@@ -1682,7 +1713,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
                         if (fput_needed)
                                 *fp = file;
                         nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
-                       br_read_lock(vfsmount_lock);
+                       br_read_lock(&vfsmount_lock);
                         rcu_read_lock();
                 } else {
                         path_get(&file->f_path);
@@ -2169,6 +2200,10 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         int want_write = 0;
         int acc_mode = op->acc_mode;
         struct file *filp;
+       struct inode *inode;
+       int symlink_ok = 0;
+       struct path save_parent = { .dentry = NULL, .mnt = NULL };
+       bool retried = false;
         int error;
  
         nd->flags &= ~LOOKUP_PARENT;
@@ -2200,30 +2235,23 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         }
  
         if (!(open_flag & O_CREAT)) {
-               int symlink_ok = 0;
                 if (nd->last.name[nd->last.len])
                         nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
                 if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW))
                         symlink_ok = 1;
                 /* we _can_ be in RCU mode here */
-               error = walk_component(nd, path, &nd->last, LAST_NORM,
-                                       !symlink_ok);
-               if (error < 0)
-                       return ERR_PTR(error);
-               if (error) /* symlink */
-                       return NULL;
-               /* sayonara */
-               error = complete_walk(nd);
-               if (error)
-                       return ERR_PTR(error);
+               error = lookup_fast(nd, &nd->last, path, &inode);
+               if (unlikely(error)) {
+                       if (error < 0)
+                               goto exit;
  
-               error = -ENOTDIR;
-               if (nd->flags & LOOKUP_DIRECTORY) {
-                       if (!nd->inode->i_op->lookup)
+                       error = lookup_slow(nd, &nd->last, path);
+                       if (error < 0)
                                 goto exit;
+
+                       inode = path->dentry->d_inode;
                 }
-               audit_inode(pathname, nd->path.dentry);
-               goto ok;
+               goto finish_lookup;
         }
  
         /* create side of things */
@@ -2241,6 +2269,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         if (nd->last.name[nd->last.len])
                 goto exit;
  
+retry_lookup:
         mutex_lock(&dir->d_inode->i_mutex);
  
         dentry = lookup_hash(nd);
@@ -2302,22 +2331,49 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
         if (error)
                 nd->flags |= LOOKUP_JUMPED;
  
+       BUG_ON(nd->flags & LOOKUP_RCU);
+       inode = path->dentry->d_inode;
+finish_lookup:
+       /* we _can_ be in RCU mode here */
         error = -ENOENT;
-       if (!path->dentry->d_inode)
-               goto exit_dput;
+       if (!inode) {
+               path_to_nameidata(path, nd);
+               goto exit;
+       }
  
-       if (path->dentry->d_inode->i_op->follow_link)
+       if (should_follow_link(inode, !symlink_ok)) {
+               if (nd->flags & LOOKUP_RCU) {
+                       if (unlikely(unlazy_walk(nd, path->dentry))) {
+                               error = -ECHILD;
+                               goto exit;
+                       }
+               }
+               BUG_ON(inode != path->dentry->d_inode);
                 return NULL;
+       }
  
-       path_to_nameidata(path, nd);
-       nd->inode = path->dentry->d_inode;
+       if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) {
+               path_to_nameidata(path, nd);
+       } else {
+               save_parent.dentry = nd->path.dentry;
+               save_parent.mnt = mntget(path->mnt);
+               nd->path.dentry = path->dentry;
+
+       }
+       nd->inode = inode;
         /* Why this, you ask?  _Now_ we might have grown LOOKUP_JUMPED... */
         error = complete_walk(nd);
-       if (error)
+       if (error) {
+               path_put(&save_parent);
                 return ERR_PTR(error);
+       }
         error = -EISDIR;
-       if (S_ISDIR(nd->inode->i_mode))
+       if ((open_flag & O_CREAT) && S_ISDIR(nd->inode->i_mode))
+               goto exit;
+       error = -ENOTDIR;
+       if ((nd->flags & LOOKUP_DIRECTORY) && !nd->inode->i_op->lookup)
                 goto exit;
+       audit_inode(pathname, nd->path.dentry);
  ok:
         if (!S_ISREG(nd->inode->i_mode))
                 will_truncate = 0;
@@ -2333,6 +2389,20 @@ common:
         if (error)
                 goto exit;
         filp = nameidata_to_filp(nd);
+       if (filp == ERR_PTR(-EOPENSTALE) && save_parent.dentry && !retried) {
+               BUG_ON(save_parent.dentry != dir);
+               path_put(&nd->path);
+               nd->path = save_parent;
+               nd->inode = dir->d_inode;
+               save_parent.mnt = NULL;
+               save_parent.dentry = NULL;
+               if (want_write) {
+                       mnt_drop_write(nd->path.mnt);
+                       want_write = 0;
+               }
+               retried = true;
+               goto retry_lookup;
+       }
         if (!IS_ERR(filp)) {
                 error = ima_file_check(filp, op->acc_mode);
                 if (error) {
@@ -2352,7 +2422,8 @@ common:
  out:
         if (want_write)
                 mnt_drop_write(nd->path.mnt);
-       path_put(&nd->path);
+       path_put(&save_parent);
+       terminate_walk(nd);
         return filp;
  
  exit_mutex_unlock:
@@ -2415,6 +2486,12 @@ out:
         if (base)
                 fput(base);
         release_open_intent(nd);
+       if (filp == ERR_PTR(-EOPENSTALE)) {
+               if (flags & LOOKUP_RCU)
+                       filp = ERR_PTR(-ECHILD);
+               else
+                       filp = ERR_PTR(-ESTALE);
+       }
         return filp;
  
  out_filp:
diff --git a/fs/namespace.c b/fs/namespace.c

index e6081996c9a2f9d26525740545445630c4737583..1e4a5fe3d7b7f789d66839f37b1f917c1fa3e2ba 100644 (file)
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,7 +397,7 @@ static int mnt_make_readonly(struct mount *mnt)
  {
         int ret = 0;
  
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
         /*
          * After storing MNT_WRITE_HOLD, we'll read the counters. This store
@@ -431,15 +431,15 @@ static int mnt_make_readonly(struct mount *mnt)
          */
         smp_wmb();
         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         return ret;
  }
  
  static void __mnt_unmake_readonly(struct mount *mnt)
  {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         mnt->mnt.mnt_flags &= ~MNT_READONLY;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  }
  
  int sb_prepare_remount_readonly(struct super_block *sb)
@@ -451,7 +451,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
         if (atomic_long_read(&sb->s_remove_count))
                 return -EBUSY;
  
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) {
                 if (!(mnt->mnt.mnt_flags & MNT_READONLY)) {
                         mnt->mnt.mnt_flags |= MNT_WRITE_HOLD;
@@ -473,7 +473,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
                 if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD)
                         mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD;
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  
         return err;
  }
@@ -522,14 +522,14 @@ struct vfsmount *lookup_mnt(struct path *path)
  {
         struct mount *child_mnt;
  
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         child_mnt = __lookup_mnt(path->mnt, path->dentry, 1);
         if (child_mnt) {
                 mnt_add_count(child_mnt, 1);
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                 return &child_mnt->mnt;
         } else {
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                 return NULL;
         }
  }
@@ -714,9 +714,9 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
         mnt->mnt.mnt_sb = root->d_sb;
         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
         mnt->mnt_parent = mnt;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         list_add_tail(&mnt->mnt_instance, &root->d_sb->s_mounts);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         return &mnt->mnt;
  }
  EXPORT_SYMBOL_GPL(vfs_kern_mount);
@@ -745,9 +745,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
                 mnt->mnt.mnt_root = dget(root);
                 mnt->mnt_mountpoint = mnt->mnt.mnt_root;
                 mnt->mnt_parent = mnt;
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 list_add_tail(&mnt->mnt_instance, &sb->s_mounts);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
  
                 if (flag & CL_SLAVE) {
                         list_add(&mnt->mnt_slave, &old->mnt_slave_list);
@@ -803,35 +803,36 @@ static void mntput_no_expire(struct mount *mnt)
  {
  put_again:
  #ifdef CONFIG_SMP
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         if (likely(atomic_read(&mnt->mnt_longterm))) {
                 mnt_add_count(mnt, -1);
-               br_read_unlock(vfsmount_lock);
+               br_read_unlock(&vfsmount_lock);
                 return;
         }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
  
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         mnt_add_count(mnt, -1);
         if (mnt_get_count(mnt)) {
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                 return;
         }
  #else
         mnt_add_count(mnt, -1);
         if (likely(mnt_get_count(mnt)))
                 return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
  #endif
         if (unlikely(mnt->mnt_pinned)) {
                 mnt_add_count(mnt, mnt->mnt_pinned + 1);
                 mnt->mnt_pinned = 0;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                 acct_auto_close_mnt(&mnt->mnt);
                 goto put_again;
         }
+
         list_del(&mnt->mnt_instance);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         mntfree(mnt);
  }
  
@@ -857,21 +858,21 @@ EXPORT_SYMBOL(mntget);
  
  void mnt_pin(struct vfsmount *mnt)
  {
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         real_mount(mnt)->mnt_pinned++;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  }
  EXPORT_SYMBOL(mnt_pin);
  
  void mnt_unpin(struct vfsmount *m)
  {
         struct mount *mnt = real_mount(m);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         if (mnt->mnt_pinned) {
                 mnt_add_count(mnt, 1);
                 mnt->mnt_pinned--;
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  }
  EXPORT_SYMBOL(mnt_unpin);
  
@@ -988,12 +989,12 @@ int may_umount_tree(struct vfsmount *m)
         BUG_ON(!m);
  
         /* write lock needed for mnt_get_count */
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         for (p = mnt; p; p = next_mnt(p, mnt)) {
                 actual_refs += mnt_get_count(p);
                 minimum_refs += 2;
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  
         if (actual_refs > minimum_refs)
                 return 0;
@@ -1020,10 +1021,10 @@ int may_umount(struct vfsmount *mnt)
  {
         int ret = 1;
         down_read(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         if (propagate_mount_busy(real_mount(mnt), 2))
                 ret = 0;
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_read(&namespace_sem);
         return ret;
  }
@@ -1040,13 +1041,13 @@ void release_mounts(struct list_head *head)
                         struct dentry *dentry;
                         struct mount *m;
  
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                         dentry = mnt->mnt_mountpoint;
                         m = mnt->mnt_parent;
                         mnt->mnt_mountpoint = mnt->mnt.mnt_root;
                         mnt->mnt_parent = mnt;
                         m->mnt_ghosts--;
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                         dput(dentry);
                         mntput(&m->mnt);
                 }
@@ -1073,8 +1074,9 @@ void umount_tree(struct mount *mnt, int propagate, struct list_head *kill)
                 list_del_init(&p->mnt_expire);
                 list_del_init(&p->mnt_list);
                 __touch_mnt_namespace(p->mnt_ns);
+               if (p->mnt_ns)
+                       __mnt_make_shortterm(p);
                 p->mnt_ns = NULL;
-               __mnt_make_shortterm(p);
                 list_del_init(&p->mnt_child);
                 if (mnt_has_parent(p)) {
                         p->mnt_parent->mnt_ghosts++;
@@ -1112,12 +1114,12 @@ static int do_umount(struct mount *mnt, int flags)
                  * probably don't strictly need the lock here if we examined
                  * all race cases, but it's a slowpath.
                  */
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 if (mnt_get_count(mnt) != 2) {
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                         return -EBUSY;
                 }
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
  
                 if (!xchg(&mnt->mnt_expiry_mark, 1))
                         return -EAGAIN;
@@ -1159,7 +1161,7 @@ static int do_umount(struct mount *mnt, int flags)
         }
  
         down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         event++;
  
         if (!(flags & MNT_DETACH))
@@ -1171,7 +1173,7 @@ static int do_umount(struct mount *mnt, int flags)
                         umount_tree(mnt, 1, &umount_list);
                 retval = 0;
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
         return retval;
@@ -1286,19 +1288,19 @@ struct mount *copy_tree(struct mount *mnt, struct dentry *dentry,
                         q = clone_mnt(p, p->mnt.mnt_root, flag);
                         if (!q)
                                 goto Enomem;
-                       br_write_lock(vfsmount_lock);
+                       br_write_lock(&vfsmount_lock);
                         list_add_tail(&q->mnt_list, &res->mnt_list);
                         attach_mnt(q, &path);
-                       br_write_unlock(vfsmount_lock);
+                       br_write_unlock(&vfsmount_lock);
                 }
         }
         return res;
  Enomem:
         if (res) {
                 LIST_HEAD(umount_list);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 umount_tree(res, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                 release_mounts(&umount_list);
         }
         return NULL;
@@ -1318,9 +1320,9 @@ void drop_collected_mounts(struct vfsmount *mnt)
  {
         LIST_HEAD(umount_list);
         down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         umount_tree(real_mount(mnt), 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
  }
@@ -1448,7 +1450,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
         if (err)
                 goto out_cleanup_ids;
  
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
  
         if (IS_MNT_SHARED(dest_mnt)) {
                 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -1467,7 +1469,7 @@ static int attach_recursive_mnt(struct mount *source_mnt,
                 list_del_init(&child->mnt_hash);
                 commit_tree(child);
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  
         return 0;
  
@@ -1565,10 +1567,10 @@ static int do_change_type(struct path *path, int flag)
                         goto out_unlock;
         }
  
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
                 change_mnt_propagation(m, type);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  
   out_unlock:
         up_write(&namespace_sem);
@@ -1617,9 +1619,9 @@ static int do_loopback(struct path *path, char *old_name,
  
         err = graft_tree(mnt, path);
         if (err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 umount_tree(mnt, 0, &umount_list);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
         }
  out2:
         unlock_mount(path);
@@ -1677,16 +1679,16 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
         else
                 err = do_remount_sb(sb, flags, data, 0);
         if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
                 mnt->mnt.mnt_flags = mnt_flags;
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
         }
         up_write(&sb->s_umount);
         if (!err) {
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 touch_mnt_namespace(mnt->mnt_ns);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
         }
         return err;
  }
@@ -1893,9 +1895,9 @@ fail:
         /* remove m from any expiration list it may be on */
         if (!list_empty(&mnt->mnt_expire)) {
                 down_write(&namespace_sem);
-               br_write_lock(vfsmount_lock);
+               br_write_lock(&vfsmount_lock);
                 list_del_init(&mnt->mnt_expire);
-               br_write_unlock(vfsmount_lock);
+               br_write_unlock(&vfsmount_lock);
                 up_write(&namespace_sem);
         }
         mntput(m);
@@ -1911,11 +1913,11 @@ fail:
  void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list)
  {
         down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
  
         list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list);
  
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_write(&namespace_sem);
  }
  EXPORT_SYMBOL(mnt_set_expiry);
@@ -1935,7 +1937,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                 return;
  
         down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
  
         /* extract from the expiration list every vfsmount that matches the
          * following criteria:
@@ -1954,7 +1956,7 @@ void mark_mounts_for_expiry(struct list_head *mounts)
                 touch_mnt_namespace(mnt->mnt_ns);
                 umount_tree(mnt, 1, &umounts);
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_write(&namespace_sem);
  
         release_mounts(&umounts);
@@ -2218,9 +2220,9 @@ void mnt_make_shortterm(struct vfsmount *m)
         struct mount *mnt = real_mount(m);
         if (atomic_add_unless(&mnt->mnt_longterm, -1, 1))
                 return;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         atomic_dec(&mnt->mnt_longterm);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  #endif
  }
  
@@ -2250,9 +2252,9 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
                 return ERR_PTR(-ENOMEM);
         }
         new_ns->root = new;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         list_add_tail(&new_ns->list, &new->mnt_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
  
         /*
          * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -2416,9 +2418,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
  int path_is_under(struct path *path1, struct path *path2)
  {
         int res;
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
         return res;
  }
  EXPORT_SYMBOL(path_is_under);
@@ -2505,7 +2507,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
         /* make sure we can reach put_old from new_root */
         if (!is_path_reachable(real_mount(old.mnt), old.dentry, &new))
                 goto out4;
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         detach_mnt(new_mnt, &parent_path);
         detach_mnt(root_mnt, &root_parent);
         /* mount old root on put_old */
@@ -2513,7 +2515,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
         /* mount new_root on / */
         attach_mnt(new_mnt, &root_parent);
         touch_mnt_namespace(current->nsproxy->mnt_ns);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         chroot_fs_refs(&root, &new);
         error = 0;
  out4:
@@ -2576,7 +2578,7 @@ void __init mnt_init(void)
         for (u = 0; u < HASH_SIZE; u++)
                 INIT_LIST_HEAD(&mount_hashtable[u]);
  
-       br_lock_init(vfsmount_lock);
+       br_lock_init(&vfsmount_lock);
  
         err = sysfs_init();
         if (err)
@@ -2596,9 +2598,9 @@ void put_mnt_ns(struct mnt_namespace *ns)
         if (!atomic_dec_and_test(&ns->count))
                 return;
         down_write(&namespace_sem);
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         umount_tree(ns->root, 0, &umount_list);
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         up_write(&namespace_sem);
         release_mounts(&umount_list);
         kfree(ns);
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c

index 3ff5fcc1528fd21ae18a7a240ec9f2920ec30d32..122e260247f53c663550073fda567a4342b0ba63 100644 (file)
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -221,6 +221,10 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
  
         already_written = 0;
  
+       errno = file_update_time(file);
+       if (errno)
+               goto outrel;
+
         bouncebuffer = vmalloc(bufsize);
         if (!bouncebuffer) {
                 errno = -EIO;   /* -ENOMEM */
@@ -252,8 +256,6 @@ ncp_file_write(struct file *file, const char __user *buf, size_t count, loff_t *
         }
         vfree(bouncebuffer);
  
-       file_update_time(file);
-
         *ppos = pos;
  
         if (pos > i_size_read(inode)) {
diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h

index 4af803f13516c98deaf7372af2dda0499e329fe6..54cc0cdb3dcbda111e24a3a67e7953e5173dd07e 100644 (file)
--- a/fs/ncpfs/ncp_fs_sb.h
+++ b/fs/ncpfs/ncp_fs_sb.h
@@ -23,17 +23,17 @@ struct ncp_mount_data_kernel {
         unsigned long    flags;         /* NCP_MOUNT_* flags */
         unsigned int     int_flags;     /* internal flags */
  #define NCP_IMOUNT_LOGGEDIN_POSSIBLE   0x0001
-       __kernel_uid32_t mounted_uid;   /* Who may umount() this filesystem? */
+       uid_t            mounted_uid;   /* Who may umount() this filesystem? */
         struct pid      *wdog_pid;      /* Who cares for our watchdog packets? */
         unsigned int     ncp_fd;        /* The socket to the ncp port */
         unsigned int     time_out;      /* How long should I wait after
                                            sending a NCP request? */
         unsigned int     retry_count;   /* And how often should I retry? */
         unsigned char    mounted_vol[NCP_VOLNAME_LEN + 1];
-       __kernel_uid32_t uid;
-       __kernel_gid32_t gid;
-       __kernel_mode_t  file_mode;
-       __kernel_mode_t  dir_mode;
+       uid_t            uid;
+       gid_t            gid;
+       umode_t          file_mode;
+       umode_t          dir_mode;
         int              info_fd;
  };
  
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c

index eb95f5091c1aff93930e17a829a808023edc2e12..970659daa323865a113d25075d461c50c7f7dc7c 100644 (file)
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -17,6 +17,7 @@
  #include <linux/kthread.h>
  #include <linux/sunrpc/svcauth_gss.h>
  #include <linux/sunrpc/bc_xprt.h>
+#include <linux/nsproxy.h>
  
  #include <net/inet_sock.h>
  
@@ -253,6 +254,7 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
         char svc_name[12];
         int ret = 0;
         int minorversion_setup;
+       struct net *net = current->nsproxy->net_ns;
  
         mutex_lock(&nfs_callback_mutex);
         if (cb_info->users++ || cb_info->task != NULL) {
@@ -265,6 +267,12 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
                 goto out_err;
         }
  
+       ret = svc_bind(serv, net);
+       if (ret < 0) {
+               printk(KERN_WARNING "NFS: bind callback service failed\n");
+               goto out_err;
+       }
+
         minorversion_setup =  nfs_minorversion_callback_svc_setup(minorversion,
                                         serv, xprt, &rqstp, &callback_svc);
         if (!minorversion_setup) {
@@ -306,6 +314,8 @@ out_err:
         dprintk("NFS: Couldn't create callback socket or server thread; "
                 "err = %d\n", ret);
         cb_info->users--;
+       if (serv)
+               svc_shutdown_net(serv, net);
         goto out;
  }
  
@@ -320,6 +330,7 @@ void nfs_callback_down(int minorversion)
         cb_info->users--;
         if (cb_info->users == 0 && cb_info->task != NULL) {
                 kthread_stop(cb_info->task);
+               svc_shutdown_net(cb_info->serv, current->nsproxy->net_ns);
                 svc_exit_thread(cb_info->rqst);
                 cb_info->serv = NULL;
                 cb_info->rqst = NULL;
@@ -332,7 +343,7 @@ void nfs_callback_down(int minorversion)
  int
  check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp)
  {
-       char *p = svc_gss_principal(rqstp);
+       char *p = rqstp->rq_cred.cr_principal;
  
         if (rqstp->rq_authop->flavour != RPC_AUTH_GSS)
                 return 1;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c

index 0989a2099688a377279d76f4f8c56dbc91070027..f430057ff3b397c2fe1f523bf5fcea4135276f8c 100644 (file)
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1354,10 +1354,10 @@ out:
  }
  
  #ifdef CONFIG_NFS_V4
-static int nfs_open_revalidate(struct dentry *, struct nameidata *);
+static int nfs4_lookup_revalidate(struct dentry *, struct nameidata *);
  
  const struct dentry_operations nfs4_dentry_operations = {
-       .d_revalidate   = nfs_open_revalidate,
+       .d_revalidate   = nfs4_lookup_revalidate,
         .d_delete       = nfs_dentry_delete,
         .d_iput         = nfs_dentry_iput,
         .d_automount    = nfs_d_automount,
@@ -1519,13 +1519,11 @@ no_open:
         return nfs_lookup(dir, dentry, nd);
  }
  
-static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
+static int nfs4_lookup_revalidate(struct dentry *dentry, struct nameidata *nd)
  {
         struct dentry *parent = NULL;
         struct inode *inode;
         struct inode *dir;
-       struct nfs_open_context *ctx;
-       struct iattr attr;
         int openflags, ret = 0;
  
         if (nd->flags & LOOKUP_RCU)
@@ -1554,57 +1552,13 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
         /* We cannot do exclusive creation on a positive dentry */
         if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
                 goto no_open_dput;
-       /* We can't create new files here */
-       openflags &= ~(O_CREAT|O_EXCL);
-
-       ctx = create_nfs_open_context(dentry, openflags);
-       ret = PTR_ERR(ctx);
-       if (IS_ERR(ctx))
-               goto out;
  
-       attr.ia_valid = ATTR_OPEN;
-       if (openflags & O_TRUNC) {
-               attr.ia_valid |= ATTR_SIZE;
-               attr.ia_size = 0;
-               nfs_wb_all(inode);
-       }
-
-       /*
-        * Note: we're not holding inode->i_mutex and so may be racing with
-        * operations that change the directory. We therefore save the
-        * change attribute *before* we do the RPC call.
-        */
-       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
-       if (IS_ERR(inode)) {
-               ret = PTR_ERR(inode);
-               switch (ret) {
-               case -EPERM:
-               case -EACCES:
-               case -EDQUOT:
-               case -ENOSPC:
-               case -EROFS:
-                       goto out_put_ctx;
-               default:
-                       goto out_drop;
-               }
-       }
-       iput(inode);
-       if (inode != dentry->d_inode)
-               goto out_drop;
+       /* Let f_op->open() actually open (and revalidate) the file */
+       ret = 1;
  
-       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
-       ret = nfs_intent_set_file(nd, ctx);
-       if (ret >= 0)
-               ret = 1;
  out:
         dput(parent);
         return ret;
-out_drop:
-       d_drop(dentry);
-       ret = 0;
-out_put_ctx:
-       put_nfs_open_context(ctx);
-       goto out;
  
  no_open_dput:
         dput(parent);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c

index 56311ca5f9f8183d3aa8c3aa8c9922db05ee32a3..a6708e6b438dd55f2924e5bb78c809c1575a97a9 100644 (file)
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -879,12 +879,81 @@ const struct file_operations nfs_file_operations = {
  static int
  nfs4_file_open(struct inode *inode, struct file *filp)
  {
+       struct nfs_open_context *ctx;
+       struct dentry *dentry = filp->f_path.dentry;
+       struct dentry *parent = NULL;
+       struct inode *dir;
+       unsigned openflags = filp->f_flags;
+       struct iattr attr;
+       int err;
+
+       BUG_ON(inode != dentry->d_inode);
         /*
-        * NFSv4 opens are handled in d_lookup and d_revalidate. If we get to
-        * this point, then something is very wrong
+        * If no cached dentry exists or if it's negative, NFSv4 handled the
+        * opens in ->lookup() or ->create().
+        *
+        * We only get this far for a cached positive dentry.  We skipped
+        * revalidation, so handle it here by dropping the dentry and returning
+        * -EOPENSTALE.  The VFS will retry the lookup/create/open.
          */
-       dprintk("NFS: %s called! inode=%p filp=%p\n", __func__, inode, filp);
-       return -ENOTDIR;
+
+       dprintk("NFS: open file(%s/%s)\n",
+               dentry->d_parent->d_name.name,
+               dentry->d_name.name);
+
+       if ((openflags & O_ACCMODE) == 3)
+               openflags--;
+
+       /* We can't create new files here */
+       openflags &= ~(O_CREAT|O_EXCL);
+
+       parent = dget_parent(dentry);
+       dir = parent->d_inode;
+
+       ctx = alloc_nfs_open_context(filp->f_path.dentry, filp->f_mode);
+       err = PTR_ERR(ctx);
+       if (IS_ERR(ctx))
+               goto out;
+
+       attr.ia_valid = ATTR_OPEN;
+       if (openflags & O_TRUNC) {
+               attr.ia_valid |= ATTR_SIZE;
+               attr.ia_size = 0;
+               nfs_wb_all(inode);
+       }
+
+       inode = NFS_PROTO(dir)->open_context(dir, ctx, openflags, &attr);
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               switch (err) {
+               case -EPERM:
+               case -EACCES:
+               case -EDQUOT:
+               case -ENOSPC:
+               case -EROFS:
+                       goto out_put_ctx;
+               default:
+                       goto out_drop;
+               }
+       }
+       iput(inode);
+       if (inode != dentry->d_inode)
+               goto out_drop;
+
+       nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
+       nfs_file_set_open_context(filp, ctx);
+       err = 0;
+
+out_put_ctx:
+       put_nfs_open_context(ctx);
+out:
+       dput(parent);
+       return err;
+
+out_drop:
+       d_drop(dentry);
+       err = -EOPENSTALE;
+       goto out_put_ctx;
  }
  
  const struct file_operations nfs4_file_operations = {
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c

index 204438cc914ea522b83907aaf618bb0dbcfa4068..34a10d78b839f4c73b3d851e19820bc712129f36 100644 (file)
--- a/fs/nfsd/auth.c
+++ b/fs/nfsd/auth.c
@@ -11,7 +11,7 @@ int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp)
         struct exp_flavor_info *end = exp->ex_flavors + exp->ex_nflavors;
  
         for (f = exp->ex_flavors; f < end; f++) {
-               if (f->pseudoflavor == rqstp->rq_flavor)
+               if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
                         return f->flags;
         }
         return exp->ex_flags;
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c

index dcb52b8845194db72aa37bb05f5f4ea2d49b9691..ba233499b9a5fc1b374bc7d79ad8f636f01135b0 100644 (file)
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -706,7 +706,7 @@ static struct cache_head *svc_export_alloc(void)
                 return NULL;
  }
  
-struct cache_detail svc_export_cache_template = {
+static struct cache_detail svc_export_cache_template = {
         .owner          = THIS_MODULE,
         .hash_size      = EXPORT_HASHMAX,
         .name           = "nfsd.export",
@@ -904,13 +904,13 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
                 return 0;
         /* ip-address based client; check sec= export option: */
         for (f = exp->ex_flavors; f < end; f++) {
-               if (f->pseudoflavor == rqstp->rq_flavor)
+               if (f->pseudoflavor == rqstp->rq_cred.cr_flavor)
                         return 0;
         }
         /* defaults in absence of sec= options: */
         if (exp->ex_nflavors == 0) {
-               if (rqstp->rq_flavor == RPC_AUTH_NULL ||
-                   rqstp->rq_flavor == RPC_AUTH_UNIX)
+               if (rqstp->rq_cred.cr_flavor == RPC_AUTH_NULL ||
+                   rqstp->rq_cred.cr_flavor == RPC_AUTH_UNIX)
                         return 0;
         }
         return nfserr_wrongsec;
diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c

index 9559ce468732e7c00ae40cd4fc3a379a525ecef7..e6c38159622fe6bc337f3d24ada7db683838ceac 100644 (file)
--- a/fs/nfsd/fault_inject.c
+++ b/fs/nfsd/fault_inject.c
@@ -58,6 +58,7 @@ static int nfsd_inject_set(void *op_ptr, u64 val)
  
  static int nfsd_inject_get(void *data, u64 *val)
  {
+       *val = 0;
         return 0;
  }
  
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c

index c8e9f637153ab3e44ba293f7097e7b32e77d4e54..a5fd6b982f277ce648bbd528947964ea2ef63c73 100644 (file)
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -650,9 +650,10 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
         struct rpc_clnt *client;
  
         if (clp->cl_minorversion == 0) {
-               if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+               if (!clp->cl_cred.cr_principal &&
+                               (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
                         return -EINVAL;
-               args.client_name = clp->cl_principal;
+               args.client_name = clp->cl_cred.cr_principal;
                 args.prognumber = conn->cb_prog,
                 args.protocol = XPRT_TRANSPORT_TCP;
                 args.authflavor = clp->cl_flavor;
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c

index 286a7f8f2024fa9667d16a0c4d1444fb0de5be98..dae36f1dee95e68defce943bedf01efc46d61a54 100644 (file)
--- a/fs/nfsd/nfs4idmap.c
+++ b/fs/nfsd/nfs4idmap.c
@@ -605,7 +605,7 @@ numeric_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namel
  static __be32
  do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, uid_t *id)
  {
-       if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+       if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
                 if (numeric_name_to_id(rqstp, type, name, namelen, id))
                         return 0;
                 /*
@@ -618,7 +618,7 @@ do_name_to_id(struct svc_rqst *rqstp, int type, const char *name, u32 namelen, u
  static int
  do_id_to_name(struct svc_rqst *rqstp, int type, uid_t id, char *name)
  {
-       if (nfs4_disable_idmapping && rqstp->rq_flavor < RPC_AUTH_GSS)
+       if (nfs4_disable_idmapping && rqstp->rq_cred.cr_flavor < RPC_AUTH_GSS)
                 return sprintf(name, "%u", id);
         return idmap_id_to_name(rqstp, type, id, name);
  }
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c

index ed3f9206a0ee87c914f133492f1f6011775bdef8..5ff0b7b9fc08f22f39cc1f2d83062baceb773bdc 100644 (file)
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -570,7 +570,7 @@ static ssize_t
  cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
  {
         struct cld_upcall *tmp, *cup;
-       struct cld_msg *cmsg = (struct cld_msg *)src;
+       struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
         uint32_t xid;
         struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
                                                 nfsd_net_id);
@@ -1029,7 +1029,7 @@ rpc_pipefs_event(struct notifier_block *nb, unsigned long event, void *ptr)
         return ret;
  }
  
-struct notifier_block nfsd4_cld_block = {
+static struct notifier_block nfsd4_cld_block = {
         .notifier_call = rpc_pipefs_event,
  };
  
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c

index 03f82c0bc35d725b7b460a9ef0ef93fd80409e38..8fdc9ec5c5d359f8defb2766e710eb35fc08c3b0 100644 (file)
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -42,6 +42,7 @@
  #include <linux/sunrpc/clnt.h>
  #include "xdr4.h"
  #include "vfs.h"
+#include "current_stateid.h"
  
  #define NFSDDBG_FACILITY                NFSDDBG_PROC
  
@@ -447,37 +448,69 @@ static struct list_head close_lru;
   *
   * which we should reject.
   */
-static void
-set_access(unsigned int *access, unsigned long bmap) {
+static unsigned int
+bmap_to_share_mode(unsigned long bmap) {
         int i;
+       unsigned int access = 0;
  
-       *access = 0;
         for (i = 1; i < 4; i++) {
                 if (test_bit(i, &bmap))
-                       *access |= i;
-       }
-}
-
-static void
-set_deny(unsigned int *deny, unsigned long bmap) {
-       int i;
-
-       *deny = 0;
-       for (i = 0; i < 4; i++) {
-               if (test_bit(i, &bmap))
-                       *deny |= i ;
+                       access |= i;
         }
+       return access;
  }
  
-static int
+static bool
  test_share(struct nfs4_ol_stateid *stp, struct nfsd4_open *open) {
         unsigned int access, deny;
  
-       set_access(&access, stp->st_access_bmap);
-       set_deny(&deny, stp->st_deny_bmap);
+       access = bmap_to_share_mode(stp->st_access_bmap);
+       deny = bmap_to_share_mode(stp->st_deny_bmap);
         if ((access & open->op_share_deny) || (deny & open->op_share_access))
-               return 0;
-       return 1;
+               return false;
+       return true;
+}
+
+/* set share access for a given stateid */
+static inline void
+set_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __set_bit(access, &stp->st_access_bmap);
+}
+
+/* clear share access for a given stateid */
+static inline void
+clear_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __clear_bit(access, &stp->st_access_bmap);
+}
+
+/* test whether a given stateid has access */
+static inline bool
+test_access(u32 access, struct nfs4_ol_stateid *stp)
+{
+       return test_bit(access, &stp->st_access_bmap);
+}
+
+/* set share deny for a given stateid */
+static inline void
+set_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __set_bit(access, &stp->st_deny_bmap);
+}
+
+/* clear share deny for a given stateid */
+static inline void
+clear_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       __clear_bit(access, &stp->st_deny_bmap);
+}
+
+/* test whether a given stateid is denying specific access */
+static inline bool
+test_deny(u32 access, struct nfs4_ol_stateid *stp)
+{
+       return test_bit(access, &stp->st_deny_bmap);
  }
  
  static int nfs4_access_to_omode(u32 access)
@@ -493,6 +526,20 @@ static int nfs4_access_to_omode(u32 access)
         BUG();
  }
  
+/* release all access and file references for a given stateid */
+static void
+release_all_access(struct nfs4_ol_stateid *stp)
+{
+       int i;
+
+       for (i = 1; i < 4; i++) {
+               if (test_access(i, stp))
+                       nfs4_file_put_access(stp->st_file,
+                                            nfs4_access_to_omode(i));
+               clear_access(i, stp);
+       }
+}
+
  static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
  {
         list_del(&stp->st_perfile);
@@ -501,16 +548,7 @@ static void unhash_generic_stateid(struct nfs4_ol_stateid *stp)
  
  static void close_generic_stateid(struct nfs4_ol_stateid *stp)
  {
-       int i;
-
-       if (stp->st_access_bmap) {
-               for (i = 1; i < 4; i++) {
-                       if (test_bit(i, &stp->st_access_bmap))
-                               nfs4_file_put_access(stp->st_file,
-                                               nfs4_access_to_omode(i));
-                       __clear_bit(i, &stp->st_access_bmap);
-               }
-       }
+       release_all_access(stp);
         put_nfs4_file(stp->st_file);
         stp->st_file = NULL;
  }
@@ -885,7 +923,7 @@ static struct nfsd4_session *alloc_init_session(struct svc_rqst *rqstp, struct n
         struct nfsd4_session *new;
         struct nfsd4_channel_attrs *fchan = &cses->fore_channel;
         int numslots, slotsize;
-       int status;
+       __be32 status;
         int idx;
  
         /*
@@ -984,7 +1022,8 @@ static inline void
  renew_client_locked(struct nfs4_client *clp)
  {
         if (is_client_expired(clp)) {
-               dprintk("%s: client (clientid %08x/%08x) already expired\n",
+               WARN_ON(1);
+               printk("%s: client (clientid %08x/%08x) already expired\n",
                         __func__,
                         clp->cl_clientid.cl_boot,
                         clp->cl_clientid.cl_id);
@@ -1049,9 +1088,7 @@ free_client(struct nfs4_client *clp)
                 list_del(&ses->se_perclnt);
                 nfsd4_put_session_locked(ses);
         }
-       if (clp->cl_cred.cr_group_info)
-               put_group_info(clp->cl_cred.cr_group_info);
-       kfree(clp->cl_principal);
+       free_svc_cred(&clp->cl_cred);
         kfree(clp->cl_name.data);
         kfree(clp);
  }
@@ -1132,12 +1169,21 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
         target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
  }
  
-static void copy_cred(struct svc_cred *target, struct svc_cred *source)
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
  {
+       if (source->cr_principal) {
+               target->cr_principal =
+                               kstrdup(source->cr_principal, GFP_KERNEL);
+               if (target->cr_principal == NULL)
+                       return -ENOMEM;
+       } else
+               target->cr_principal = NULL;
+       target->cr_flavor = source->cr_flavor;
         target->cr_uid = source->cr_uid;
         target->cr_gid = source->cr_gid;
         target->cr_group_info = source->cr_group_info;
         get_group_info(target->cr_group_info);
+       return 0;
  }
  
  static int same_name(const char *n1, const char *n2)
@@ -1157,11 +1203,31 @@ same_clid(clientid_t *cl1, clientid_t *cl2)
         return (cl1->cl_boot == cl2->cl_boot) && (cl1->cl_id == cl2->cl_id);
  }
  
-/* XXX what about NGROUP */
+static bool groups_equal(struct group_info *g1, struct group_info *g2)
+{
+       int i;
+
+       if (g1->ngroups != g2->ngroups)
+               return false;
+       for (i=0; i<g1->ngroups; i++)
+               if (GROUP_AT(g1, i) != GROUP_AT(g2, i))
+                       return false;
+       return true;
+}
+
  static int
  same_creds(struct svc_cred *cr1, struct svc_cred *cr2)
  {
-       return cr1->cr_uid == cr2->cr_uid;
+       if ((cr1->cr_flavor != cr2->cr_flavor)
+               || (cr1->cr_uid != cr2->cr_uid)
+               || (cr1->cr_gid != cr2->cr_gid)
+               || !groups_equal(cr1->cr_group_info, cr2->cr_group_info))
+               return false;
+       if (cr1->cr_principal == cr2->cr_principal)
+               return true;
+       if (!cr1->cr_principal || !cr2->cr_principal)
+               return false;
+       return 0 == strcmp(cr1->cr_principal, cr1->cr_principal);
  }
  
  static void gen_clid(struct nfs4_client *clp)
@@ -1204,25 +1270,20 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
  {
         struct nfs4_client *clp;
         struct sockaddr *sa = svc_addr(rqstp);
-       char *princ;
+       int ret;
  
         clp = alloc_client(name);
         if (clp == NULL)
                 return NULL;
  
         INIT_LIST_HEAD(&clp->cl_sessions);
-
-       princ = svc_gss_principal(rqstp);
-       if (princ) {
-               clp->cl_principal = kstrdup(princ, GFP_KERNEL);
-               if (clp->cl_principal == NULL) {
-                       spin_lock(&client_lock);
-                       free_client(clp);
-                       spin_unlock(&client_lock);
-                       return NULL;
-               }
+       ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
+       if (ret) {
+               spin_lock(&client_lock);
+               free_client(clp);
+               spin_unlock(&client_lock);
+               return NULL;
         }
-
         idr_init(&clp->cl_stateids);
         memcpy(clp->cl_recdir, recdir, HEXDIR_LEN);
         atomic_set(&clp->cl_refcount, 0);
@@ -1240,8 +1301,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir,
         rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
         copy_verf(clp, verf);
         rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
-       clp->cl_flavor = rqstp->rq_flavor;
-       copy_cred(&clp->cl_cred, &rqstp->rq_cred);
         gen_confirm(clp);
         clp->cl_cb_session = NULL;
         return clp;
@@ -1470,18 +1529,32 @@ nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
         clid->flags = new->cl_exchange_flags;
  }
  
+static bool client_has_state(struct nfs4_client *clp)
+{
+       /*
+        * Note clp->cl_openowners check isn't quite right: there's no
+        * need to count owners without stateid's.
+        *
+        * Also note we should probably be using this in 4.0 case too.
+        */
+       return !list_empty(&clp->cl_openowners)
+               || !list_empty(&clp->cl_delegations)
+               || !list_empty(&clp->cl_sessions);
+}
+
  __be32
  nfsd4_exchange_id(struct svc_rqst *rqstp,
                   struct nfsd4_compound_state *cstate,
                   struct nfsd4_exchange_id *exid)
  {
         struct nfs4_client *unconf, *conf, *new;
-       int status;
+       __be32 status;
         unsigned int            strhashval;
         char                    dname[HEXDIR_LEN];
         char                    addr_str[INET6_ADDRSTRLEN];
         nfs4_verifier           verf = exid->verifier;
         struct sockaddr         *sa = svc_addr(rqstp);
+       bool    update = exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A;
  
         rpc_ntop(sa, addr_str, sizeof(addr_str));
         dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
@@ -1507,71 +1580,63 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
         status = nfs4_make_rec_clidname(dname, &exid->clname);
  
         if (status)
-               goto error;
+               return status;
  
         strhashval = clientstr_hashval(dname);
  
+       /* Cases below refer to rfc 5661 section 18.35.4: */
         nfs4_lock_state();
-       status = nfs_ok;
-
         conf = find_confirmed_client_by_str(dname, strhashval);
         if (conf) {
-               if (!clp_used_exchangeid(conf)) {
-                       status = nfserr_clid_inuse; /* XXX: ? */
-                       goto out;
-               }
-               if (!same_verf(&verf, &conf->cl_verifier)) {
-                       /* 18.35.4 case 8 */
-                       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+               bool creds_match = same_creds(&conf->cl_cred, &rqstp->rq_cred);
+               bool verfs_match = same_verf(&verf, &conf->cl_verifier);
+
+               if (update) {
+                       if (!clp_used_exchangeid(conf)) { /* buggy client */
+                               status = nfserr_inval;
+                               goto out;
+                       }
+                       if (!creds_match) { /* case 9 */
+                               status = nfserr_perm;
+                               goto out;
+                       }
+                       if (!verfs_match) { /* case 8 */
                                 status = nfserr_not_same;
                                 goto out;
                         }
-                       /* Client reboot: destroy old state */
-                       expire_client(conf);
-                       goto out_new;
+                       /* case 6 */
+                       exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
+                       new = conf;
+                       goto out_copy;
                 }
-               if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
-                       /* 18.35.4 case 9 */
-                       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
-                               status = nfserr_perm;
+               if (!creds_match) { /* case 3 */
+                       if (client_has_state(conf)) {
+                               status = nfserr_clid_inuse;
                                 goto out;
                         }
                         expire_client(conf);
                         goto out_new;
                 }
-               /*
-                * Set bit when the owner id and verifier map to an already
-                * confirmed client id (18.35.3).
-                */
-               exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
-
-               /*
-                * Falling into 18.35.4 case 2, possible router replay.
-                * Leave confirmed record intact and return same result.
-                */
-               copy_verf(conf, &verf);
-               new = conf;
-               goto out_copy;
+               if (verfs_match) { /* case 2 */
+                       conf->cl_exchange_flags |= EXCHGID4_FLAG_CONFIRMED_R;
+                       new = conf;
+                       goto out_copy;
+               }
+               /* case 5, client reboot */
+               goto out_new;
         }
  
-       /* 18.35.4 case 7 */
-       if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
+       if (update) { /* case 7 */
                 status = nfserr_noent;
                 goto out;
         }
  
         unconf  = find_unconfirmed_client_by_str(dname, strhashval);
-       if (unconf) {
-               /*
-                * Possible retry or client restart.  Per 18.35.4 case 4,
-                * a new unconfirmed record should be generated regardless
-                * of whether any properties have changed.
-                */
+       if (unconf) /* case 4, possible retry or client restart */
                 expire_client(unconf);
-       }
  
+       /* case 1 (normal case) */
  out_new:
-       /* Normal case */
         new = create_client(exid->clname, dname, rqstp, &verf);
         if (new == NULL) {
                 status = nfserr_jukebox;
@@ -1584,7 +1649,7 @@ out_copy:
         exid->clientid.cl_boot = new->cl_clientid.cl_boot;
         exid->clientid.cl_id = new->cl_clientid.cl_id;
  
-       exid->seqid = 1;
+       exid->seqid = new->cl_cs_slot.sl_seqid + 1;
         nfsd4_set_ex_flags(new, exid);
  
         dprintk("nfsd4_exchange_id seqid %d flags %x\n",
@@ -1593,12 +1658,10 @@ out_copy:
  
  out:
         nfs4_unlock_state();
-error:
-       dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
         return status;
  }
  
-static int
+static __be32
  check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
  {
         dprintk("%s enter. seqid %d slot_seqid %d\n", __func__, seqid,
@@ -1626,7 +1689,7 @@ check_slot_seqid(u32 seqid, u32 slot_seqid, int slot_inuse)
   */
  static void
  nfsd4_cache_create_session(struct nfsd4_create_session *cr_ses,
-                          struct nfsd4_clid_slot *slot, int nfserr)
+                          struct nfsd4_clid_slot *slot, __be32 nfserr)
  {
         slot->sl_status = nfserr;
         memcpy(&slot->sl_cr_ses, cr_ses, sizeof(*cr_ses));
@@ -1657,7 +1720,7 @@ nfsd4_replay_create_session(struct nfsd4_create_session *cr_ses,
                                 /* seqid, slotID, slotID, slotID, status */ \
                         5 ) * sizeof(__be32))
  
-static __be32 check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
+static bool check_forechannel_attrs(struct nfsd4_channel_attrs fchannel)
  {
         return fchannel.maxreq_sz < NFSD_MIN_REQ_HDR_SEQ_SZ
                 || fchannel.maxresp_sz < NFSD_MIN_RESP_HDR_SEQ_SZ;
@@ -1673,7 +1736,7 @@ nfsd4_create_session(struct svc_rqst *rqstp,
         struct nfsd4_session *new;
         struct nfsd4_clid_slot *cs_slot = NULL;
         bool confirm_me = false;
-       int status = 0;
+       __be32 status = 0;
  
         if (cr_ses->flags & ~SESSION4_FLAG_MASK_A)
                 return nfserr_inval;
@@ -1686,16 +1749,10 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                 cs_slot = &conf->cl_cs_slot;
                 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
                 if (status == nfserr_replay_cache) {
-                       dprintk("Got a create_session replay! seqid= %d\n",
-                               cs_slot->sl_seqid);
-                       /* Return the cached reply status */
                         status = nfsd4_replay_create_session(cr_ses, cs_slot);
                         goto out;
                 } else if (cr_ses->seqid != cs_slot->sl_seqid + 1) {
                         status = nfserr_seq_misordered;
-                       dprintk("Sequence misordered!\n");
-                       dprintk("Expected seqid= %d but got seqid= %d\n",
-                               cs_slot->sl_seqid, cr_ses->seqid);
                         goto out;
                 }
         } else if (unconf) {
@@ -1704,7 +1761,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                         status = nfserr_clid_inuse;
                         goto out;
                 }
-
                 cs_slot = &unconf->cl_cs_slot;
                 status = check_slot_seqid(cr_ses->seqid, cs_slot->sl_seqid, 0);
                 if (status) {
@@ -1712,7 +1768,6 @@ nfsd4_create_session(struct svc_rqst *rqstp,
                         status = nfserr_seq_misordered;
                         goto out;
                 }
-
                 confirm_me = true;
                 conf = unconf;
         } else {
@@ -1749,8 +1804,14 @@ nfsd4_create_session(struct svc_rqst *rqstp,
  
         /* cache solo and embedded create sessions under the state lock */
         nfsd4_cache_create_session(cr_ses, cs_slot, status);
-       if (confirm_me)
+       if (confirm_me) {
+               unsigned int hash = clientstr_hashval(unconf->cl_recdir);
+               struct nfs4_client *old =
+                       find_confirmed_client_by_str(conf->cl_recdir, hash);
+               if (old)
+                       expire_client(old);
                 move_to_confirmed(conf);
+       }
  out:
         nfs4_unlock_state();
         dprintk("%s returns %d\n", __func__, ntohl(status));
@@ -1818,7 +1879,7 @@ nfsd4_destroy_session(struct svc_rqst *r,
                       struct nfsd4_destroy_session *sessionid)
  {
         struct nfsd4_session *ses;
-       u32 status = nfserr_badsession;
+       __be32 status = nfserr_badsession;
  
         /* Notes:
          * - The confirmed nfs4_client->cl_sessionid holds destroyed sessinid
@@ -1914,7 +1975,7 @@ nfsd4_sequence(struct svc_rqst *rqstp,
         struct nfsd4_session *session;
         struct nfsd4_slot *slot;
         struct nfsd4_conn *conn;
-       int status;
+       __be32 status;
  
         if (resp->opcnt != 1)
                 return nfserr_sequence_pos;
@@ -2008,18 +2069,11 @@ out:
         return status;
  }
  
-static inline bool has_resources(struct nfs4_client *clp)
-{
-       return !list_empty(&clp->cl_openowners)
-               || !list_empty(&clp->cl_delegations)
-               || !list_empty(&clp->cl_sessions);
-}
-
  __be32
  nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_destroy_clientid *dc)
  {
         struct nfs4_client *conf, *unconf, *clp;
-       int status = 0;
+       __be32 status = 0;
  
         nfs4_lock_state();
         unconf = find_unconfirmed_client(&dc->clientid);
@@ -2028,7 +2082,7 @@ nfsd4_destroy_clientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *csta
         if (conf) {
                 clp = conf;
  
-               if (!is_client_expired(conf) && has_resources(conf)) {
+               if (!is_client_expired(conf) && client_has_state(conf)) {
                         status = nfserr_clientid_busy;
                         goto out;
                 }
@@ -2055,7 +2109,7 @@ out:
  __be32
  nfsd4_reclaim_complete(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_reclaim_complete *rc)
  {
-       int status = 0;
+       __be32 status = 0;
  
         if (rc->rca_one_fs) {
                 if (!cstate->current_fh.fh_dentry)
@@ -2106,17 +2160,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
         if (status)
                 return status;
  
-       /* 
-        * XXX The Duplicate Request Cache (DRC) has been checked (??)
-        * We get here on a DRC miss.
-        */
-
         strhashval = clientstr_hashval(dname);
  
+       /* Cases below refer to rfc 3530 section 14.2.33: */
         nfs4_lock_state();
         conf = find_confirmed_client_by_str(dname, strhashval);
         if (conf) {
-               /* RFC 3530 14.2.33 CASE 0: */
+               /* case 0: */
                 status = nfserr_clid_inuse;
                 if (clp_used_exchangeid(conf))
                         goto out;
@@ -2129,63 +2179,18 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
                         goto out;
                 }
         }
-       /*
-        * section 14.2.33 of RFC 3530 (under the heading "IMPLEMENTATION")
-        * has a description of SETCLIENTID request processing consisting
-        * of 5 bullet points, labeled as CASE0 - CASE4 below.
-        */
         unconf = find_unconfirmed_client_by_str(dname, strhashval);
+       if (unconf)
+               expire_client(unconf);
         status = nfserr_jukebox;
-       if (!conf) {
-               /*
-                * RFC 3530 14.2.33 CASE 4:
-                * placed first, because it is the normal case
-                */
-               if (unconf)
-                       expire_client(unconf);
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
-               gen_clid(new);
-       } else if (same_verf(&conf->cl_verifier, &clverifier)) {
-               /*
-                * RFC 3530 14.2.33 CASE 1:
-                * probable callback update
-                */
-               if (unconf) {
-                       /* Note this is removing unconfirmed {*x***},
-                        * which is stronger than RFC recommended {vxc**}.
-                        * This has the advantage that there is at most
-                        * one {*x***} in either list at any time.
-                        */
-                       expire_client(unconf);
-               }
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
+       new = create_client(clname, dname, rqstp, &clverifier);
+       if (new == NULL)
+               goto out;
+       if (conf && same_verf(&conf->cl_verifier, &clverifier))
+               /* case 1: probable callback update */
                 copy_clid(new, conf);
-       } else if (!unconf) {
-               /*
-                * RFC 3530 14.2.33 CASE 2:
-                * probable client reboot; state will be removed if
-                * confirmed.
-                */
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
-               gen_clid(new);
-       } else {
-               /*
-                * RFC 3530 14.2.33 CASE 3:
-                * probable client reboot; state will be removed if
-                * confirmed.
-                */
-               expire_client(unconf);
-               new = create_client(clname, dname, rqstp, &clverifier);
-               if (new == NULL)
-                       goto out;
+       else /* case 4 (new client) or cases 2, 3 (client reboot): */
                 gen_clid(new);
-       }
         /*
          * XXX: we should probably set this at creation time, and check
          * for consistent minorversion use throughout:
@@ -2203,17 +2208,11 @@ out:
  }
  
  
-/*
- * Section 14.2.34 of RFC 3530 (under the heading "IMPLEMENTATION") has
- * a description of SETCLIENTID_CONFIRM request processing consisting of 4
- * bullets, labeled as CASE1 - CASE4 below.
- */
  __be32
  nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
                          struct nfsd4_compound_state *cstate,
                          struct nfsd4_setclientid_confirm *setclientid_confirm)
  {
-       struct sockaddr *sa = svc_addr(rqstp);
         struct nfs4_client *conf, *unconf;
         nfs4_verifier confirm = setclientid_confirm->sc_confirm; 
         clientid_t * clid = &setclientid_confirm->sc_clientid;
@@ -2221,84 +2220,44 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
  
         if (STALE_CLIENTID(clid))
                 return nfserr_stale_clientid;
-       /* 
-        * XXX The Duplicate Request Cache (DRC) has been checked (??)
-        * We get here on a DRC miss.
-        */
-
         nfs4_lock_state();
  
         conf = find_confirmed_client(clid);
         unconf = find_unconfirmed_client(clid);
-
-       status = nfserr_clid_inuse;
-       if (conf && !rpc_cmp_addr((struct sockaddr *) &conf->cl_addr, sa))
-               goto out;
-       if (unconf && !rpc_cmp_addr((struct sockaddr *) &unconf->cl_addr, sa))
-               goto out;
-
         /*
-        * section 14.2.34 of RFC 3530 has a description of
-        * SETCLIENTID_CONFIRM request processing consisting
-        * of 4 bullet points, labeled as CASE1 - CASE4 below.
+        * We try hard to give out unique clientid's, so if we get an
+        * attempt to confirm the same clientid with a different cred,
+        * there's a bug somewhere.  Let's charitably assume it's our
+        * bug.
          */
-       if (conf && unconf && same_verf(&confirm, &unconf->cl_confirm)) {
-               /*
-                * RFC 3530 14.2.34 CASE 1:
-                * callback update
-                */
-               if (!same_creds(&conf->cl_cred, &unconf->cl_cred))
-                       status = nfserr_clid_inuse;
-               else {
-                       nfsd4_change_callback(conf, &unconf->cl_cb_conn);
-                       nfsd4_probe_callback(conf);
-                       expire_client(unconf);
+       status = nfserr_serverfault;
+       if (unconf && !same_creds(&unconf->cl_cred, &rqstp->rq_cred))
+               goto out;
+       if (conf && !same_creds(&conf->cl_cred, &rqstp->rq_cred))
+               goto out;
+       /* cases below refer to rfc 3530 section 14.2.34: */
+       if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
+               if (conf && !unconf) /* case 2: probable retransmit */
                         status = nfs_ok;
+               else /* case 4: client hasn't noticed we rebooted yet? */
+                       status = nfserr_stale_clientid;
+               goto out;
+       }
+       status = nfs_ok;
+       if (conf) { /* case 1: callback update */
+               nfsd4_change_callback(conf, &unconf->cl_cb_conn);
+               nfsd4_probe_callback(conf);
+               expire_client(unconf);
+       } else { /* case 3: normal case; new or rebooted client */
+               unsigned int hash = clientstr_hashval(unconf->cl_recdir);
  
+               conf = find_confirmed_client_by_str(unconf->cl_recdir, hash);
+               if (conf) {
+                       nfsd4_client_record_remove(conf);
+                       expire_client(conf);
                 }
-       } else if (conf && !unconf) {
-               /*
-                * RFC 3530 14.2.34 CASE 2:
-                * probable retransmitted request; play it safe and
-                * do nothing.
-                */
-               if (!same_creds(&conf->cl_cred, &rqstp->rq_cred))
-                       status = nfserr_clid_inuse;
-               else
-                       status = nfs_ok;
-       } else if (!conf && unconf
-                       && same_verf(&unconf->cl_confirm, &confirm)) {
-               /*
-                * RFC 3530 14.2.34 CASE 3:
-                * Normal case; new or rebooted client:
-                */
-               if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred)) {
-                       status = nfserr_clid_inuse;
-               } else {
-                       unsigned int hash =
-                               clientstr_hashval(unconf->cl_recdir);
-                       conf = find_confirmed_client_by_str(unconf->cl_recdir,
-                                                           hash);
-                       if (conf) {
-                               nfsd4_client_record_remove(conf);
-                               expire_client(conf);
-                       }
-                       move_to_confirmed(unconf);
-                       conf = unconf;
-                       nfsd4_probe_callback(conf);
-                       status = nfs_ok;
-               }
-       } else if ((!conf || (conf && !same_verf(&conf->cl_confirm, &confirm)))
-           && (!unconf || (unconf && !same_verf(&unconf->cl_confirm,
-                                                               &confirm)))) {
-               /*
-                * RFC 3530 14.2.34 CASE 4:
-                * Client probably hasn't noticed that we rebooted yet.
-                */
-               status = nfserr_stale_clientid;
-       } else {
-               /* check that we have hit one of the cases...*/
-               status = nfserr_clid_inuse;
+               move_to_confirmed(unconf);
+               nfsd4_probe_callback(unconf);
         }
  out:
         nfs4_unlock_state();
@@ -2454,8 +2413,8 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp,
         stp->st_file = fp;
         stp->st_access_bmap = 0;
         stp->st_deny_bmap = 0;
-       __set_bit(open->op_share_access, &stp->st_access_bmap);
-       __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+       set_access(open->op_share_access, stp);
+       set_deny(open->op_share_deny, stp);
         stp->st_openstp = NULL;
  }
  
@@ -2534,8 +2493,8 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type)
         ret = nfserr_locked;
         /* Search for conflicting share reservations */
         list_for_each_entry(stp, &fp->fi_stateids, st_perfile) {
-               if (test_bit(deny_type, &stp->st_deny_bmap) ||
-                   test_bit(NFS4_SHARE_DENY_BOTH, &stp->st_deny_bmap))
+               if (test_deny(deny_type, stp) ||
+                   test_deny(NFS4_SHARE_DENY_BOTH, stp))
                         goto out;
         }
         ret = nfs_ok;
@@ -2791,7 +2750,7 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
         bool new_access;
         __be32 status;
  
-       new_access = !test_bit(op_share_access, &stp->st_access_bmap);
+       new_access = !test_access(op_share_access, stp);
         if (new_access) {
                 status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open);
                 if (status)
@@ -2806,8 +2765,8 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c
                 return status;
         }
         /* remember the open */
-       __set_bit(op_share_access, &stp->st_access_bmap);
-       __set_bit(open->op_share_deny, &stp->st_deny_bmap);
+       set_access(op_share_access, stp);
+       set_deny(open->op_share_deny, stp);
  
         return nfs_ok;
  }
@@ -3282,18 +3241,18 @@ STALE_STATEID(stateid_t *stateid)
  }
  
  static inline int
-access_permit_read(unsigned long access_bmap)
+access_permit_read(struct nfs4_ol_stateid *stp)
  {
-       return test_bit(NFS4_SHARE_ACCESS_READ, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap);
+       return test_access(NFS4_SHARE_ACCESS_READ, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp) ||
+               test_access(NFS4_SHARE_ACCESS_WRITE, stp);
  }
  
  static inline int
-access_permit_write(unsigned long access_bmap)
+access_permit_write(struct nfs4_ol_stateid *stp)
  {
-       return test_bit(NFS4_SHARE_ACCESS_WRITE, &access_bmap) ||
-               test_bit(NFS4_SHARE_ACCESS_BOTH, &access_bmap);
+       return test_access(NFS4_SHARE_ACCESS_WRITE, stp) ||
+               test_access(NFS4_SHARE_ACCESS_BOTH, stp);
  }
  
  static
@@ -3304,9 +3263,9 @@ __be32 nfs4_check_openmode(struct nfs4_ol_stateid *stp, int flags)
         /* For lock stateid's, we test the parent open, not the lock: */
         if (stp->st_openstp)
                 stp = stp->st_openstp;
-       if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap)))
+       if ((flags & WR_STATE) && !access_permit_write(stp))
                  goto out;
-       if ((flags & RD_STATE) && (!access_permit_read(stp->st_access_bmap)))
+       if ((flags & RD_STATE) && !access_permit_read(stp))
                  goto out;
         status = nfs_ok;
  out:
@@ -3346,7 +3305,7 @@ static bool stateid_generation_after(stateid_t *a, stateid_t *b)
         return (s32)a->si_generation - (s32)b->si_generation > 0;
  }
  
-static int check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
+static __be32 check_stateid_generation(stateid_t *in, stateid_t *ref, bool has_session)
  {
         /*
          * When sessions are used the stateid generation number is ignored
@@ -3655,10 +3614,10 @@ out:
  
  static inline void nfs4_stateid_downgrade_bit(struct nfs4_ol_stateid *stp, u32 access)
  {
-       if (!test_bit(access, &stp->st_access_bmap))
+       if (!test_access(access, stp))
                 return;
         nfs4_file_put_access(stp->st_file, nfs4_access_to_omode(access));
-       __clear_bit(access, &stp->st_access_bmap);
+       clear_access(access, stp);
  }
  
  static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_access)
@@ -3680,12 +3639,12 @@ static inline void nfs4_stateid_downgrade(struct nfs4_ol_stateid *stp, u32 to_ac
  }
  
  static void
-reset_union_bmap_deny(unsigned long deny, unsigned long *bmap)
+reset_union_bmap_deny(unsigned long deny, struct nfs4_ol_stateid *stp)
  {
         int i;
         for (i = 0; i < 4; i++) {
                 if ((i & deny) != i)
-                       __clear_bit(i, bmap);
+                       clear_deny(i, stp);
         }
  }
  
@@ -3712,19 +3671,19 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
         if (status)
                 goto out; 
         status = nfserr_inval;
-       if (!test_bit(od->od_share_access, &stp->st_access_bmap)) {
-               dprintk("NFSD:access not a subset current bitmap: 0x%lx, input access=%08x\n",
+       if (!test_access(od->od_share_access, stp)) {
+               dprintk("NFSD: access not a subset current bitmap: 0x%lx, input access=%08x\n",
                         stp->st_access_bmap, od->od_share_access);
                 goto out;
         }
-       if (!test_bit(od->od_share_deny, &stp->st_deny_bmap)) {
+       if (!test_deny(od->od_share_deny, stp)) {
                 dprintk("NFSD:deny not a subset current bitmap: 0x%lx, input deny=%08x\n",
                         stp->st_deny_bmap, od->od_share_deny);
                 goto out;
         }
         nfs4_stateid_downgrade(stp, od->od_share_access);
  
-       reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap);
+       reset_union_bmap_deny(od->od_share_deny, stp);
  
         update_stateid(&stp->st_stid.sc_stateid);
         memcpy(&od->od_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
@@ -4014,13 +3973,13 @@ static void get_lock_access(struct nfs4_ol_stateid *lock_stp, u32 access)
         struct nfs4_file *fp = lock_stp->st_file;
         int oflag = nfs4_access_to_omode(access);
  
-       if (test_bit(access, &lock_stp->st_access_bmap))
+       if (test_access(access, lock_stp))
                 return;
         nfs4_file_get_access(fp, oflag);
-       __set_bit(access, &lock_stp->st_access_bmap);
+       set_access(access, lock_stp);
  }
  
-__be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
+static __be32 lookup_or_create_lock_state(struct nfsd4_compound_state *cstate, struct nfs4_ol_stateid *ost, struct nfsd4_lock *lock, struct nfs4_ol_stateid **lst, bool *new)
  {
         struct nfs4_file *fi = ost->st_file;
         struct nfs4_openowner *oo = openowner(ost->st_stateowner);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c

index 74c00bc92b9af6b01e95e55c119b90d61fbf9d34..4949667c84ea0c3d687a46faf0a455c410c39b6f 100644 (file)
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1674,12 +1674,12 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
  
  static void write32(__be32 **p, u32 n)
  {
-       *(*p)++ = n;
+       *(*p)++ = htonl(n);
  }
  
  static void write64(__be32 **p, u64 n)
  {
-       write32(p, (u32)(n >> 32));
+       write32(p, (n >> 32));
         write32(p, (u32)n);
  }
  
@@ -1744,15 +1744,16 @@ static void encode_seqid_op_tail(struct nfsd4_compoundres *resp, __be32 *save, _
  }
  
  /* Encode as an array of strings the string given with components
- * separated @sep.
+ * separated by @sep; a component enclosed in esc_enter ... esc_exit may contain @sep.
   */
-static __be32 nfsd4_encode_components(char sep, char *components,
-                                  __be32 **pp, int *buflen)
+static __be32 nfsd4_encode_components_esc(char sep, char *components,
+                                  __be32 **pp, int *buflen,
+                                  char esc_enter, char esc_exit)
  {
         __be32 *p = *pp;
         __be32 *countp = p;
         int strlen, count=0;
-       char *str, *end;
+       char *str, *end, *next;
  
         dprintk("nfsd4_encode_components(%s)\n", components);
         if ((*buflen -= 4) < 0)
@@ -1760,8 +1761,23 @@ static __be32 nfsd4_encode_components(char sep, char *components,
         WRITE32(0); /* We will fill this in with @count later */
         end = str = components;
         while (*end) {
-               for (; *end && (*end != sep); end++)
-                       ; /* Point to end of component */
+               bool found_esc = false;
+
+               /* try to parse as esc_enter, ..., esc_exit, sep */
+               if (*str == esc_enter) {
+                       for (; *end && (*end != esc_exit); end++)
+                               /* find esc_exit or end of string */;
+                       next = end + 1;
+                       if (*end && (!*next || *next == sep)) {
+                               str++;
+                               found_esc = true;
+                       }
+               }
+
+               if (!found_esc)
+                       for (; *end && (*end != sep); end++)
+                               /* find sep or end of string */;
+
                 strlen = end - str;
                 if (strlen) {
                         if ((*buflen -= ((XDR_QUADLEN(strlen) << 2) + 4)) < 0)
@@ -1780,6 +1796,15 @@ static __be32 nfsd4_encode_components(char sep, char *components,
         return 0;
  }
  
+/* Encode as an array of strings the string given with components
+ * separated by @sep (no escape characters).
+ */
+static __be32 nfsd4_encode_components(char sep, char *components,
+                                  __be32 **pp, int *buflen)
+{
+       return nfsd4_encode_components_esc(sep, components, pp, buflen, 0, 0);
+}
+
  /*
   * encode a location element of a fs_locations structure
   */
@@ -1789,7 +1814,8 @@ static __be32 nfsd4_encode_fs_location4(struct nfsd4_fs_location *location,
         __be32 status;
         __be32 *p = *pp;
  
-       status = nfsd4_encode_components(':', location->hosts, &p, buflen);
+       status = nfsd4_encode_components_esc(':', location->hosts, &p, buflen,
+                                               '[', ']');
         if (status)
                 return status;
         status = nfsd4_encode_components('/', location->path, &p, buflen);
@@ -3251,7 +3277,7 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
  }
  
  static __be32
-nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, __be32 nfserr,
                          struct nfsd4_exchange_id *exid)
  {
         __be32 *p;
@@ -3306,7 +3332,7 @@ nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
  }
  
  static __be32
-nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_create_session(struct nfsd4_compoundres *resp, __be32 nfserr,
                             struct nfsd4_create_session *sess)
  {
         __be32 *p;
@@ -3355,14 +3381,14 @@ nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
  }
  
  static __be32
-nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, __be32 nfserr,
                              struct nfsd4_destroy_session *destroy_session)
  {
         return nfserr;
  }
  
  static __be32
-nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
                           struct nfsd4_free_stateid *free_stateid)
  {
         __be32 *p;
@@ -3371,13 +3397,13 @@ nfsd4_encode_free_stateid(struct nfsd4_compoundres *resp, int nfserr,
                 return nfserr;
  
         RESERVE_SPACE(4);
-       WRITE32(nfserr);
+       *p++ = nfserr;
         ADJUST_ARGS();
         return nfserr;
  }
  
  static __be32
-nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
+nfsd4_encode_sequence(struct nfsd4_compoundres *resp, __be32 nfserr,
                       struct nfsd4_sequence *seq)
  {
         __be32 *p;
@@ -3399,8 +3425,8 @@ nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
         return 0;
  }
  
-__be32
-nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, int nfserr,
+static __be32
+nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
                           struct nfsd4_test_stateid *test_stateid)
  {
         struct nfsd4_test_stateid_id *stateid, *next;
@@ -3503,7 +3529,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
   * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
   * will be at least a page and will therefore hold the xdr_buf head.
   */
-int nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *resp, u32 pad)
  {
         struct xdr_buf *xb = &resp->rqstp->rq_res;
         struct nfsd4_session *session = NULL;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c

index 72699885ac4892d4faf08bad18c1c09a286d8a8b..c55298ed5772577e5afe3bd613c3e5a0df3b69dd 100644 (file)
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -661,6 +661,7 @@ static ssize_t __write_ports_addfd(char *buf)
  {
         char *mesg = buf;
         int fd, err;
+       struct net *net = &init_net;
  
         err = get_int(&mesg, &fd);
         if (err != 0 || fd < 0)
@@ -672,6 +673,8 @@ static ssize_t __write_ports_addfd(char *buf)
  
         err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT);
         if (err < 0) {
+               if (nfsd_serv->sv_nrthreads == 1)
+                       svc_shutdown_net(nfsd_serv, net);
                 svc_destroy(nfsd_serv);
                 return err;
         }
@@ -709,6 +712,7 @@ static ssize_t __write_ports_addxprt(char *buf)
         char transport[16];
         struct svc_xprt *xprt;
         int port, err;
+       struct net *net = &init_net;
  
         if (sscanf(buf, "%15s %4u", transport, &port) != 2)
                 return -EINVAL;
@@ -720,12 +724,12 @@ static ssize_t __write_ports_addxprt(char *buf)
         if (err != 0)
                 return err;
  
-       err = svc_create_xprt(nfsd_serv, transport, &init_net,
+       err = svc_create_xprt(nfsd_serv, transport, net,
                                 PF_INET, port, SVC_SOCK_ANONYMOUS);
         if (err < 0)
                 goto out_err;
  
-       err = svc_create_xprt(nfsd_serv, transport, &init_net,
+       err = svc_create_xprt(nfsd_serv, transport, net,
                                 PF_INET6, port, SVC_SOCK_ANONYMOUS);
         if (err < 0 && err != -EAFNOSUPPORT)
                 goto out_close;
@@ -734,12 +738,14 @@ static ssize_t __write_ports_addxprt(char *buf)
         nfsd_serv->sv_nrthreads--;
         return 0;
  out_close:
-       xprt = svc_find_xprt(nfsd_serv, transport, &init_net, PF_INET, port);
+       xprt = svc_find_xprt(nfsd_serv, transport, net, PF_INET, port);
         if (xprt != NULL) {
                 svc_close_xprt(xprt);
                 svc_xprt_put(xprt);
         }
  out_err:
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
         svc_destroy(nfsd_serv);
         return err;
  }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c

index cb4d51d8cbdb3818cae8a4404d3e65b0aca84799..ee709fc8f58bc0b62a3f7ca64104630fe803b6d0 100644 (file)
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -11,6 +11,7 @@
  #include <linux/module.h>
  #include <linux/fs_struct.h>
  #include <linux/swap.h>
+#include <linux/nsproxy.h>
  
  #include <linux/sunrpc/stats.h>
  #include <linux/sunrpc/svcsock.h>
@@ -330,6 +331,8 @@ static int nfsd_get_default_max_blksize(void)
  
  int nfsd_create_serv(void)
  {
+       int error;
+
         WARN_ON(!mutex_is_locked(&nfsd_mutex));
         if (nfsd_serv) {
                 svc_get(nfsd_serv);
@@ -343,6 +346,12 @@ int nfsd_create_serv(void)
         if (nfsd_serv == NULL)
                 return -ENOMEM;
  
+       error = svc_bind(nfsd_serv, current->nsproxy->net_ns);
+       if (error < 0) {
+               svc_destroy(nfsd_serv);
+               return error;
+       }
+
         set_max_drc();
         do_gettimeofday(&nfssvc_boot);          /* record boot time */
         return 0;
@@ -373,6 +382,7 @@ int nfsd_set_nrthreads(int n, int *nthreads)
         int i = 0;
         int tot = 0;
         int err = 0;
+       struct net *net = &init_net;
  
         WARN_ON(!mutex_is_locked(&nfsd_mutex));
  
@@ -417,6 +427,9 @@ int nfsd_set_nrthreads(int n, int *nthreads)
                 if (err)
                         break;
         }
+
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
         svc_destroy(nfsd_serv);
  
         return err;
@@ -432,6 +445,7 @@ nfsd_svc(unsigned short port, int nrservs)
  {
         int     error;
         bool    nfsd_up_before;
+       struct net *net = &init_net;
  
         mutex_lock(&nfsd_mutex);
         dprintk("nfsd: creating service\n");
@@ -464,6 +478,8 @@ out_shutdown:
         if (error < 0 && !nfsd_up_before)
                 nfsd_shutdown();
  out_destroy:
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
         svc_destroy(nfsd_serv);         /* Release server */
  out:
         mutex_unlock(&nfsd_mutex);
@@ -547,6 +563,9 @@ nfsd(void *vrqstp)
         nfsdstats.th_cnt --;
  
  out:
+       if (rqstp->rq_server->sv_nrthreads == 1)
+               svc_shutdown_net(rqstp->rq_server, &init_net);
+
         /* Release the thread */
         svc_exit_thread(rqstp);
  
@@ -659,8 +678,12 @@ int nfsd_pool_stats_open(struct inode *inode, struct file *file)
  int nfsd_pool_stats_release(struct inode *inode, struct file *file)
  {
         int ret = seq_release(inode, file);
+       struct net *net = &init_net;
+
         mutex_lock(&nfsd_mutex);
         /* this function really, really should have been called svc_put() */
+       if (nfsd_serv->sv_nrthreads == 1)
+               svc_shutdown_net(nfsd_serv, net);
         svc_destroy(nfsd_serv);
         mutex_unlock(&nfsd_mutex);
         return ret;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h

index 89ab137d379a3f6756b5b5616083862e8f22d88f..849091e16ea6afd43e4ddd2dbd17962fdd87ad85 100644 (file)
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -232,7 +232,6 @@ struct nfs4_client {
         time_t                  cl_time;        /* time of last lease renewal */
         struct sockaddr_storage cl_addr;        /* client ipaddress */
         u32                     cl_flavor;      /* setclientid pseudoflavor */
-       char                    *cl_principal;  /* setclientid principal name */
         struct svc_cred         cl_cred;        /* setclientid principal */
         clientid_t              cl_clientid;    /* generated by server */
         nfs4_verifier           cl_confirm;     /* generated by server */
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h

index 1b3501598ab5dbb4609ba19e4f7c3322b29f70ba..acd127d4ee821660e71fe1e38ef1c804962f6508 100644 (file)
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -60,7 +60,7 @@ struct nfsd4_compound_state {
         __be32                  *datap;
         size_t                  iovlen;
         u32                     minorversion;
-       u32                     status;
+       __be32                  status;
         stateid_t       current_stateid;
         stateid_t       save_stateid;
         /* to indicate current and saved state id presents */
@@ -364,7 +364,7 @@ struct nfsd4_test_stateid_id {
  };
  
  struct nfsd4_test_stateid {
-       __be32          ts_num_ids;
+       u32             ts_num_ids;
         struct list_head ts_stateid_list;
  };
  
@@ -549,7 +549,7 @@ int nfs4svc_decode_compoundargs(struct svc_rqst *, __be32 *,
                 struct nfsd4_compoundargs *);
  int nfs4svc_encode_compoundres(struct svc_rqst *, __be32 *,
                 struct nfsd4_compoundres *);
-int nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
+__be32 nfsd4_check_resp_size(struct nfsd4_compoundres *, u32);
  void nfsd4_encode_operation(struct nfsd4_compoundres *, struct nfsd4_op *);
  void nfsd4_encode_replay(struct nfsd4_compoundres *resp, struct nfsd4_op *op);
  __be32 nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c

index 0bb2c2010b9512ba5fd971fdbdc34eba2abab886..b72847988b78d96d99b7571d17fea769e463c6b0 100644 (file)
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -508,31 +508,29 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh,
         return nilfs_get_dentry(sb, fid->cno, fid->parent_ino, fid->parent_gen);
  }
  
-static int nilfs_encode_fh(struct dentry *dentry, __u32 *fh, int *lenp,
-                          int connectable)
+static int nilfs_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+                          struct inode *parent)
  {
         struct nilfs_fid *fid = (struct nilfs_fid *)fh;
-       struct inode *inode = dentry->d_inode;
         struct nilfs_root *root = NILFS_I(inode)->i_root;
         int type;
  
-       if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE ||
-           (connectable && *lenp < NILFS_FID_SIZE_CONNECTABLE))
+       if (parent && *lenp < NILFS_FID_SIZE_CONNECTABLE) {
+               *lenp = NILFS_FID_SIZE_CONNECTABLE;
+               return 255;
+       }
+       if (*lenp < NILFS_FID_SIZE_NON_CONNECTABLE) {
+               *lenp = NILFS_FID_SIZE_NON_CONNECTABLE;
                 return 255;
+       }
  
         fid->cno = root->cno;
         fid->ino = inode->i_ino;
         fid->gen = inode->i_generation;
  
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                 fid->parent_ino = parent->i_ino;
                 fid->parent_gen = parent->i_generation;
-               spin_unlock(&dentry->d_lock);
-
                 type = FILEID_NILFS_WITH_PARENT;
                 *lenp = NILFS_FID_SIZE_CONNECTABLE;
         } else {
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c

index ccb14d3fc0de99790d282ae17ce984338ca309ab..b39c5c161adb64bff0d33faa64f41d8f4a9942cd 100644 (file)
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -123,7 +123,7 @@ int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask)
  }
  EXPORT_SYMBOL_GPL(__fsnotify_parent);
  
-static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
+static int send_to_group(struct inode *to_tell,
                          struct fsnotify_mark *inode_mark,
                          struct fsnotify_mark *vfsmount_mark,
                          __u32 mask, void *data,
@@ -168,10 +168,10 @@ static int send_to_group(struct inode *to_tell, struct vfsmount *mnt,
                         vfsmount_test_mask &= ~inode_mark->ignored_mask;
         }
  
-       pr_debug("%s: group=%p to_tell=%p mnt=%p mask=%x inode_mark=%p"
+       pr_debug("%s: group=%p to_tell=%p mask=%x inode_mark=%p"
                  " inode_test_mask=%x vfsmount_mark=%p vfsmount_test_mask=%x"
                  " data=%p data_is=%d cookie=%d event=%p\n",
-                __func__, group, to_tell, mnt, mask, inode_mark,
+                __func__, group, to_tell, mask, inode_mark,
                  inode_test_mask, vfsmount_mark, vfsmount_test_mask, data,
                  data_is, cookie, *event);
  
@@ -258,16 +258,16 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
  
                 if (inode_group > vfsmount_group) {
                         /* handle inode */
-                       ret = send_to_group(to_tell, NULL, inode_mark, NULL, mask, data,
+                       ret = send_to_group(to_tell, inode_mark, NULL, mask, data,
                                             data_is, cookie, file_name, &event);
                         /* we didn't use the vfsmount_mark */
                         vfsmount_group = NULL;
                 } else if (vfsmount_group > inode_group) {
-                       ret = send_to_group(to_tell, &mnt->mnt, NULL, vfsmount_mark, mask, data,
+                       ret = send_to_group(to_tell, NULL, vfsmount_mark, mask, data,
                                             data_is, cookie, file_name, &event);
                         inode_group = NULL;
                 } else {
-                       ret = send_to_group(to_tell, &mnt->mnt, inode_mark, vfsmount_mark,
+                       ret = send_to_group(to_tell, inode_mark, vfsmount_mark,
                                             mask, data, data_is, cookie, file_name,
                                             &event);
                 }
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c

index 8639169221c7aed21c0bd600ab4ef1a0d8102cb1..7389d2d5e51d257c72f9fb0c1468c38a28b309e4 100644 (file)
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2096,7 +2096,9 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
         err = file_remove_suid(file);
         if (err)
                 goto out;
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
         written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
                         count);
  out:
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c

index c7ee03c22226253d970cce94beb11f6353b3e1d0..0725e605465040b6b1e7c5e7744c5243968158c9 100644 (file)
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -422,45 +422,46 @@ int ocfs2_block_check_validate(void *data, size_t blocksize,
                                struct ocfs2_blockcheck_stats *stats)
  {
         int rc = 0;
-       struct ocfs2_block_check check;
+       u32 bc_crc32e;
+       u16 bc_ecc;
         u32 crc, ecc;
  
         ocfs2_blockcheck_inc_check(stats);
  
-       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
-       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+       bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       bc_ecc = le16_to_cpu(bc->bc_ecc);
  
         memset(bc, 0, sizeof(struct ocfs2_block_check));
  
         /* Fast path - if the crc32 validates, we're good to go */
         crc = crc32_le(~0, data, blocksize);
-       if (crc == check.bc_crc32e)
+       if (crc == bc_crc32e)
                 goto out;
  
         ocfs2_blockcheck_inc_failure(stats);
         mlog(ML_ERROR,
              "CRC32 failed: stored: 0x%x, computed 0x%x. Applying ECC.\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
  
         /* Ok, try ECC fixups */
         ecc = ocfs2_hamming_encode_block(data, blocksize);
-       ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+       ocfs2_hamming_fix_block(data, blocksize, ecc ^ bc_ecc);
  
         /* And check the crc32 again */
         crc = crc32_le(~0, data, blocksize);
-       if (crc == check.bc_crc32e) {
+       if (crc == bc_crc32e) {
                 ocfs2_blockcheck_inc_recover(stats);
                 goto out;
         }
  
         mlog(ML_ERROR, "Fixed CRC32 failed: stored: 0x%x, computed 0x%x\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
  
         rc = -EIO;
  
  out:
-       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
-       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+       bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(bc_ecc);
  
         return rc;
  }
@@ -528,7 +529,8 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
                                    struct ocfs2_blockcheck_stats *stats)
  {
         int i, rc = 0;
-       struct ocfs2_block_check check;
+       u32 bc_crc32e;
+       u16 bc_ecc;
         u32 crc, ecc, fix;
  
         BUG_ON(nr < 0);
@@ -538,21 +540,21 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
  
         ocfs2_blockcheck_inc_check(stats);
  
-       check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
-       check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+       bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+       bc_ecc = le16_to_cpu(bc->bc_ecc);
  
         memset(bc, 0, sizeof(struct ocfs2_block_check));
  
         /* Fast path - if the crc32 validates, we're good to go */
         for (i = 0, crc = ~0; i < nr; i++)
                 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-       if (crc == check.bc_crc32e)
+       if (crc == bc_crc32e)
                 goto out;
  
         ocfs2_blockcheck_inc_failure(stats);
         mlog(ML_ERROR,
              "CRC32 failed: stored: %u, computed %u.  Applying ECC.\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
  
         /* Ok, try ECC fixups */
         for (i = 0, ecc = 0; i < nr; i++) {
@@ -565,7 +567,7 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
                                                 bhs[i]->b_size * 8,
                                                 bhs[i]->b_size * 8 * i);
         }
-       fix = ecc ^ check.bc_ecc;
+       fix = ecc ^ bc_ecc;
         for (i = 0; i < nr; i++) {
                 /*
                  * Try the fix against each buffer.  It will only affect
@@ -578,19 +580,19 @@ int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
         /* And check the crc32 again */
         for (i = 0, crc = ~0; i < nr; i++)
                 crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
-       if (crc == check.bc_crc32e) {
+       if (crc == bc_crc32e) {
                 ocfs2_blockcheck_inc_recover(stats);
                 goto out;
         }
  
         mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
-            (unsigned int)check.bc_crc32e, (unsigned int)crc);
+            (unsigned int)bc_crc32e, (unsigned int)crc);
  
         rc = -EIO;
  
  out:
-       bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
-       bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+       bc->bc_crc32e = cpu_to_le32(bc_crc32e);
+       bc->bc_ecc = cpu_to_le16(bc_ecc);
  
         return rc;
  }
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c

index 3a3ed4bb794b0d6c75e7e321b042b1b4128fbd27..fbec0be6232622ddda0c3ed4ed49c50cc0129386 100644 (file)
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -293,7 +293,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
         struct dlm_proxy_ast *past = (struct dlm_proxy_ast *) msg->buf;
         char *name;
         struct list_head *iter, *head=NULL;
-       u64 cookie;
+       __be64 cookie;
         u32 flags;
         u8 node;
  
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h

index a5952ceecba5a83147389ad4a1cd24972ee0bfbe..de854cca12a2d23dea5652d3dad38461c7dbde13 100644 (file)
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -679,7 +679,7 @@ struct dlm_query_join_packet {
  };
  
  union dlm_query_join_response {
-       u32 intval;
+       __be32 intval;
         struct dlm_query_join_packet packet;
  };
  
@@ -755,8 +755,8 @@ struct dlm_query_region {
  struct dlm_node_info {
         u8 ni_nodenum;
         u8 pad1;
-       u16 ni_ipv4_port;
-       u32 ni_ipv4_address;
+       __be16 ni_ipv4_port;
+       __be32 ni_ipv4_address;
  };
  
  struct dlm_query_nodeinfo {
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c

index 92f2ead0fab6de22fa138cc4410dee6e1544216c..9e89d70df337fc98836e87f90e38887a716843e6 100644 (file)
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -818,7 +818,7 @@ static void dlm_query_join_packet_to_wire(struct dlm_query_join_packet *packet,
         union dlm_query_join_response response;
  
         response.packet = *packet;
-       *wire = cpu_to_be32(response.intval);
+       *wire = be32_to_cpu(response.intval);
  }
  
  static void dlm_query_join_wire_to_packet(u32 wire,
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c

index 745db42528d5fd2f875a099177158f347fd361e7..322216a5f0dd1e0f2e178540781b3c6fd263c985 100644 (file)
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -177,21 +177,23 @@ bail:
         return parent;
  }
  
-static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
-                          int connectable)
+static int ocfs2_encode_fh(struct inode *inode, u32 *fh_in, int *max_len,
+                          struct inode *parent)
  {
-       struct inode *inode = dentry->d_inode;
         int len = *max_len;
         int type = 1;
         u64 blkno;
         u32 generation;
         __le32 *fh = (__force __le32 *) fh_in;
  
+#ifdef TRACE_HOOKS_ARE_NOT_BRAINDEAD_IN_YOUR_OPINION
+#error "You go ahead and fix that mess, then.  Somehow"
         trace_ocfs2_encode_fh_begin(dentry, dentry->d_name.len,
                                     dentry->d_name.name,
                                     fh, len, connectable);
+#endif
  
-       if (connectable && (len < 6)) {
+       if (parent && (len < 6)) {
                 *max_len = 6;
                 type = 255;
                 goto bail;
@@ -211,12 +213,7 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
         fh[1] = cpu_to_le32((u32)(blkno & 0xffffffff));
         fh[2] = cpu_to_le32(generation);
  
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               struct inode *parent;
-
-               spin_lock(&dentry->d_lock);
-
-               parent = dentry->d_parent->d_inode;
+       if (parent) {
                 blkno = OCFS2_I(parent)->ip_blkno;
                 generation = parent->i_generation;
  
@@ -224,8 +221,6 @@ static int ocfs2_encode_fh(struct dentry *dentry, u32 *fh_in, int *max_len,
                 fh[4] = cpu_to_le32((u32)(blkno & 0xffffffff));
                 fh[5] = cpu_to_le32(generation);
  
-               spin_unlock(&dentry->d_lock);
-
                 len = 6;
                 type = 2;
  
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c

index 735514ca400f7942268dff8f387c7c029506b859..d89e08a81eda8875fcd59d76c5e1d75827e7d7ac 100644 (file)
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -273,11 +273,13 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
         inode->i_gid = le32_to_cpu(fe->i_gid);
  
         /* Fast symlinks will have i_size but no allocated clusters. */
-       if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
+       if (S_ISLNK(inode->i_mode) && !fe->i_clusters) {
                 inode->i_blocks = 0;
-       else
+               inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
+       } else {
                 inode->i_blocks = ocfs2_inode_sector_count(inode);
-       inode->i_mapping->a_ops = &ocfs2_aops;
+               inode->i_mapping->a_ops = &ocfs2_aops;
+       }
         inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
         inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
         inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
@@ -331,10 +333,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
                     OCFS2_I(inode)->ip_dir_lock_gen = 1;
                     break;
             case S_IFLNK:
-                   if (ocfs2_inode_is_fast_symlink(inode))
-                       inode->i_op = &ocfs2_fast_symlink_inode_operations;
-                   else
-                       inode->i_op = &ocfs2_symlink_inode_operations;
+                   inode->i_op = &ocfs2_symlink_inode_operations;
                     i_size_write(inode, le64_to_cpu(fe->i_size));
                     break;
             default:
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c

index a1a1bfd652c90d49521ad3ea12a908f9a168c1e9..d96f7f81d8dd3257f49bb02885db296af22881cb 100644 (file)
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -864,7 +864,7 @@ int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info,
                 if (status)
                         break;
  
-               reqp = (struct ocfs2_info_request *)(unsigned long)req_addr;
+               reqp = (struct ocfs2_info_request __user *)(unsigned long)req_addr;
                 if (!reqp) {
                         status = -EINVAL;
                         goto bail;
@@ -888,9 +888,11 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
         struct ocfs2_space_resv sr;
         struct ocfs2_new_group_input input;
         struct reflink_arguments args;
-       const char *old_path, *new_path;
+       const char __user *old_path;
+       const char __user *new_path;
         bool preserve;
         struct ocfs2_info info;
+       void __user *argp = (void __user *)arg;
  
         switch (cmd) {
         case OCFS2_IOC_GETFLAGS:
@@ -937,17 +939,15 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
  
                 return ocfs2_group_add(inode, &input);
         case OCFS2_IOC_REFLINK:
-               if (copy_from_user(&args, (struct reflink_arguments *)arg,
-                                  sizeof(args)))
+               if (copy_from_user(&args, argp, sizeof(args)))
                         return -EFAULT;
-               old_path = (const char *)(unsigned long)args.old_path;
-               new_path = (const char *)(unsigned long)args.new_path;
+               old_path = (const char __user *)(unsigned long)args.old_path;
+               new_path = (const char __user *)(unsigned long)args.new_path;
                 preserve = (args.preserve != 0);
  
                 return ocfs2_reflink_ioctl(inode, old_path, new_path, preserve);
         case OCFS2_IOC_INFO:
-               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
-                                  sizeof(struct ocfs2_info)))
+               if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
                         return -EFAULT;
  
                 return ocfs2_info_handle(inode, &info, 0);
@@ -960,22 +960,20 @@ long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                 if (!capable(CAP_SYS_ADMIN))
                         return -EPERM;
  
-               if (copy_from_user(&range, (struct fstrim_range *)arg,
-                   sizeof(range)))
+               if (copy_from_user(&range, argp, sizeof(range)))
                         return -EFAULT;
  
                 ret = ocfs2_trim_fs(sb, &range);
                 if (ret < 0)
                         return ret;
  
-               if (copy_to_user((struct fstrim_range *)arg, &range,
-                   sizeof(range)))
+               if (copy_to_user(argp, &range, sizeof(range)))
                         return -EFAULT;
  
                 return 0;
         }
         case OCFS2_IOC_MOVE_EXT:
-               return ocfs2_ioctl_move_extents(filp, (void __user *)arg);
+               return ocfs2_ioctl_move_extents(filp, argp);
         default:
                 return -ENOTTY;
         }
@@ -988,6 +986,7 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
         struct reflink_arguments args;
         struct inode *inode = file->f_path.dentry->d_inode;
         struct ocfs2_info info;
+       void __user *argp = (void __user *)arg;
  
         switch (cmd) {
         case OCFS2_IOC32_GETFLAGS:
@@ -1006,16 +1005,14 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
         case FITRIM:
                 break;
         case OCFS2_IOC_REFLINK:
-               if (copy_from_user(&args, (struct reflink_arguments *)arg,
-                                  sizeof(args)))
+               if (copy_from_user(&args, argp, sizeof(args)))
                         return -EFAULT;
                 preserve = (args.preserve != 0);
  
                 return ocfs2_reflink_ioctl(inode, compat_ptr(args.old_path),
                                            compat_ptr(args.new_path), preserve);
         case OCFS2_IOC_INFO:
-               if (copy_from_user(&info, (struct ocfs2_info __user *)arg,
-                                  sizeof(struct ocfs2_info)))
+               if (copy_from_user(&info, argp, sizeof(struct ocfs2_info)))
                         return -EFAULT;
  
                 return ocfs2_info_handle(inode, &info, 1);
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c

index b1e3fce72ea4767bf795c692e98faacebc42797c..6083432f667e3077eb466842ef0f00136d0b4b6f 100644 (file)
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -1082,8 +1082,7 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
         context->file = filp;
  
         if (argp) {
-               if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
-                                  sizeof(range))) {
+               if (copy_from_user(&range, argp, sizeof(range))) {
                         status = -EFAULT;
                         goto out;
                 }
@@ -1138,8 +1137,7 @@ out:
          * length and new_offset even if failure happens somewhere.
          */
         if (argp) {
-               if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
-                               sizeof(range)))
+               if (copy_to_user(argp, &range, sizeof(range)))
                         status = -EFAULT;
         }
  
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c

index a9856e3eaaf09753b4921d56ccdfb172db5cad7b..9f39c640cddf2076b951295dde5ef68217b26452 100644 (file)
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1724,15 +1724,16 @@ static int ocfs2_symlink(struct inode *dir,
         fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
         inode->i_rdev = 0;
         newsize = l - 1;
+       inode->i_op = &ocfs2_symlink_inode_operations;
         if (l > ocfs2_fast_symlink_chars(sb)) {
                 u32 offset = 0;
  
-               inode->i_op = &ocfs2_symlink_inode_operations;
                 status = dquot_alloc_space_nodirty(inode,
                     ocfs2_clusters_to_bytes(osb->sb, 1));
                 if (status)
                         goto bail;
                 did_quota = 1;
+               inode->i_mapping->a_ops = &ocfs2_aops;
                 status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
                                               new_fe_bh,
                                               handle, data_ac, NULL,
@@ -1750,7 +1751,7 @@ static int ocfs2_symlink(struct inode *dir,
                 i_size_write(inode, newsize);
                 inode->i_blocks = ocfs2_inode_sector_count(inode);
         } else {
-               inode->i_op = &ocfs2_fast_symlink_inode_operations;
+               inode->i_mapping->a_ops = &ocfs2_fast_symlink_aops;
                 memcpy((char *) fe->id2.i_symlink, symname, l);
                 i_size_write(inode, newsize);
                 inode->i_blocks = 0;
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c

index 5d22872e2bb36012b711ac7720a90bee24717b15..f1fbb4b552ad3649238becdd9c21d4b138d5c8d7 100644 (file)
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -54,101 +54,40 @@
  #include "buffer_head_io.h"
  
  
-static char *ocfs2_fast_symlink_getlink(struct inode *inode,
-                                       struct buffer_head **bh)
+static int ocfs2_fast_symlink_readpage(struct file *unused, struct page *page)
  {
-       int status;
-       char *link = NULL;
+       struct inode *inode = page->mapping->host;
+       struct buffer_head *bh;
+       int status = ocfs2_read_inode_block(inode, &bh);
         struct ocfs2_dinode *fe;
+       const char *link;
+       void *kaddr;
+       size_t len;
  
-       status = ocfs2_read_inode_block(inode, bh);
         if (status < 0) {
                 mlog_errno(status);
-               link = ERR_PTR(status);
-               goto bail;
+               return status;
         }
  
-       fe = (struct ocfs2_dinode *) (*bh)->b_data;
+       fe = (struct ocfs2_dinode *) bh->b_data;
         link = (char *) fe->id2.i_symlink;
-bail:
-
-       return link;
-}
-
-static int ocfs2_readlink(struct dentry *dentry,
-                         char __user *buffer,
-                         int buflen)
-{
-       int ret;
-       char *link;
-       struct buffer_head *bh = NULL;
-       struct inode *inode = dentry->d_inode;
-
-       link = ocfs2_fast_symlink_getlink(inode, &bh);
-       if (IS_ERR(link)) {
-               ret = PTR_ERR(link);
-               goto out;
-       }
-
-       /*
-        * Without vfsmount we can't update atime now,
-        * but we will update atime here ultimately.
-        */
-       ret = vfs_readlink(dentry, buffer, buflen, link);
-
+       /* will be less than a page size */
+       len = strnlen(link, ocfs2_fast_symlink_chars(inode->i_sb));
+       kaddr = kmap_atomic(page);
+       memcpy(kaddr, link, len + 1);
+       kunmap_atomic(kaddr);
+       SetPageUptodate(page);
+       unlock_page(page);
         brelse(bh);
-out:
-       if (ret < 0)
-               mlog_errno(ret);
-       return ret;
+       return 0;
  }
  
-static void *ocfs2_fast_follow_link(struct dentry *dentry,
-                                   struct nameidata *nd)
-{
-       int status = 0;
-       int len;
-       char *target, *link = ERR_PTR(-ENOMEM);
-       struct inode *inode = dentry->d_inode;
-       struct buffer_head *bh = NULL;
-
-       BUG_ON(!ocfs2_inode_is_fast_symlink(inode));
-       target = ocfs2_fast_symlink_getlink(inode, &bh);
-       if (IS_ERR(target)) {
-               status = PTR_ERR(target);
-               mlog_errno(status);
-               goto bail;
-       }
-
-       /* Fast symlinks can't be large */
-       len = strnlen(target, ocfs2_fast_symlink_chars(inode->i_sb));
-       link = kzalloc(len + 1, GFP_NOFS);
-       if (!link) {
-               status = -ENOMEM;
-               mlog_errno(status);
-               goto bail;
-       }
-
-       memcpy(link, target, len);
-
-bail:
-       nd_set_link(nd, status ? ERR_PTR(status) : link);
-       brelse(bh);
-
-       if (status)
-               mlog_errno(status);
-       return NULL;
-}
-
-static void ocfs2_fast_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
-{
-       char *link = nd_get_link(nd);
-       if (!IS_ERR(link))
-               kfree(link);
-}
+const struct address_space_operations ocfs2_fast_symlink_aops = {
+       .readpage               = ocfs2_fast_symlink_readpage,
+};
  
  const struct inode_operations ocfs2_symlink_inode_operations = {
-       .readlink       = page_readlink,
+       .readlink       = generic_readlink,
         .follow_link    = page_follow_link_light,
         .put_link       = page_put_link,
         .getattr        = ocfs2_getattr,
@@ -159,15 +98,3 @@ const struct inode_operations ocfs2_symlink_inode_operations = {
         .removexattr    = generic_removexattr,
         .fiemap         = ocfs2_fiemap,
  };
-const struct inode_operations ocfs2_fast_symlink_inode_operations = {
-       .readlink       = ocfs2_readlink,
-       .follow_link    = ocfs2_fast_follow_link,
-       .put_link       = ocfs2_fast_put_link,
-       .getattr        = ocfs2_getattr,
-       .setattr        = ocfs2_setattr,
-       .setxattr       = generic_setxattr,
-       .getxattr       = generic_getxattr,
-       .listxattr      = ocfs2_listxattr,
-       .removexattr    = generic_removexattr,
-       .fiemap         = ocfs2_fiemap,
-};
diff --git a/fs/ocfs2/symlink.h b/fs/ocfs2/symlink.h

index 65a6c9c6ad51d1018147cff4685743dd22bae935..71ee4245e9192274552ef9492412b36b068e72d6 100644 (file)
--- a/fs/ocfs2/symlink.h
+++ b/fs/ocfs2/symlink.h
@@ -27,7 +27,7 @@
  #define OCFS2_SYMLINK_H
  
  extern const struct inode_operations ocfs2_symlink_inode_operations;
-extern const struct inode_operations ocfs2_fast_symlink_inode_operations;
+extern const struct address_space_operations ocfs2_fast_symlink_aops;
  
  /*
   * Test whether an inode is a fast symlink.
diff --git a/fs/open.c b/fs/open.c

index d54301219d04f1c8fed18d6de15ed590a593e2ab..d6c79a0dffc7b0827b09562e11fa0f610af5657d 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -654,10 +654,23 @@ static inline int __get_file_write_access(struct inode *inode,
         return error;
  }
  
-static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
-                                       struct file *f,
-                                       int (*open)(struct inode *, struct file *),
-                                       const struct cred *cred)
+int open_check_o_direct(struct file *f)
+{
+       /* NB: we're sure to have correct a_ops only after f_op->open */
+       if (f->f_flags & O_DIRECT) {
+               if (!f->f_mapping->a_ops ||
+                   ((!f->f_mapping->a_ops->direct_IO) &&
+                   (!f->f_mapping->a_ops->get_xip_mem))) {
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+static struct file *do_dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+                                  struct file *f,
+                                  int (*open)(struct inode *, struct file *),
+                                  const struct cred *cred)
  {
         static const struct file_operations empty_fops = {};
         struct inode *inode;
@@ -713,16 +726,6 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
  
         file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
  
-       /* NB: we're sure to have correct a_ops only after f_op->open */
-       if (f->f_flags & O_DIRECT) {
-               if (!f->f_mapping->a_ops ||
-                   ((!f->f_mapping->a_ops->direct_IO) &&
-                   (!f->f_mapping->a_ops->get_xip_mem))) {
-                       fput(f);
-                       f = ERR_PTR(-EINVAL);
-               }
-       }
-
         return f;
  
  cleanup_all:
@@ -744,12 +747,29 @@ cleanup_all:
         f->f_path.dentry = NULL;
         f->f_path.mnt = NULL;
  cleanup_file:
-       put_filp(f);
         dput(dentry);
         mntput(mnt);
         return ERR_PTR(error);
  }
  
+static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
+                               struct file *f,
+                               int (*open)(struct inode *, struct file *),
+                               const struct cred *cred)
+{
+       struct file *res = do_dentry_open(dentry, mnt, f, open, cred);
+       if (!IS_ERR(res)) {
+               int error = open_check_o_direct(f);
+               if (error) {
+                       fput(res);
+                       res = ERR_PTR(error);
+               }
+       } else {
+               put_filp(f);
+       }
+       return res;
+}
+
  /**
   * lookup_instantiate_filp - instantiates the open intent filp
   * @nd: pointer to nameidata
@@ -804,13 +824,31 @@ struct file *nameidata_to_filp(struct nameidata *nd)
  
         /* Pick up the filp from the open intent */
         filp = nd->intent.open.file;
-       nd->intent.open.file = NULL;
  
         /* Has the filesystem initialised the file for us? */
-       if (filp->f_path.dentry == NULL) {
+       if (filp->f_path.dentry != NULL) {
+               nd->intent.open.file = NULL;
+       } else {
+               struct file *res;
+
                 path_get(&nd->path);
-               filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp,
-                                    NULL, cred);
+               res = do_dentry_open(nd->path.dentry, nd->path.mnt,
+                                    filp, NULL, cred);
+               if (!IS_ERR(res)) {
+                       int error;
+
+                       nd->intent.open.file = NULL;
+                       BUG_ON(res != filp);
+
+                       error = open_check_o_direct(filp);
+                       if (error) {
+                               fput(filp);
+                               filp = ERR_PTR(error);
+                       }
+               } else {
+                       /* Allow nd->intent.open.file to be recycled */
+                       filp = res;
+               }
         }
         return filp;
  }
diff --git a/fs/pipe.c b/fs/pipe.c

index 95ebb56de494de44efb6224ccfee1b96e4169267..49c1065256fd10d9d5fdca3cf449b1e56bd58a0a 100644 (file)
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -654,8 +654,11 @@ out:
                 wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
                 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
         }
-       if (ret > 0)
-               file_update_time(filp);
+       if (ret > 0) {
+               int err = file_update_time(filp);
+               if (err)
+                       ret = err;
+       }
         return ret;
  }
  
diff --git a/fs/pnode.c b/fs/pnode.c

index ab5fa9e1a79ac8277ac1cb51db6a92e55ba2c935..bed378db075813350362c39f423d1b4335240bfa 100644 (file)
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -257,12 +257,12 @@ int propagate_mnt(struct mount *dest_mnt, struct dentry *dest_dentry,
                 prev_src_mnt  = child;
         }
  out:
-       br_write_lock(vfsmount_lock);
+       br_write_lock(&vfsmount_lock);
         while (!list_empty(&tmp_list)) {
                 child = list_first_entry(&tmp_list, struct mount, mnt_hash);
                 umount_tree(child, 0, &umount_list);
         }
-       br_write_unlock(vfsmount_lock);
+       br_write_unlock(&vfsmount_lock);
         release_mounts(&umount_list);
         return ret;
  }
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c

index 12412852d88a94d574bacebb5e64200f202db852..5e289a7cbad17d8547458f1d8b2526f2e85e5cb9 100644 (file)
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -23,12 +23,12 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
  
         poll_wait(file, &p->ns->poll, wait);
  
-       br_read_lock(vfsmount_lock);
+       br_read_lock(&vfsmount_lock);
         if (p->m.poll_event != ns->event) {
                 p->m.poll_event = ns->event;
                 res |= POLLERR | POLLPRI;
         }
-       br_read_unlock(vfsmount_lock);
+       br_read_unlock(&vfsmount_lock);
  
         return res;
  }
diff --git a/fs/readdir.c b/fs/readdir.c

index cc0a8227cddf688f70e289c427666057ce98e613..39e3370d79cf1e6399843137e2d64165baf49a03 100644 (file)
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -108,11 +108,11 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
         int error;
         struct file * file;
         struct readdir_callback buf;
+       int fput_needed;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.result = 0;
         buf.dirent = dirent;
@@ -121,8 +121,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd,
         if (buf.result)
                 error = buf.result;
  
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
  
@@ -195,16 +194,15 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
         struct file * file;
         struct linux_dirent __user * lastdirent;
         struct getdents_callback buf;
+       int fput_needed;
         int error;
  
-       error = -EFAULT;
         if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.current_dir = dirent;
         buf.previous = NULL;
@@ -221,8 +219,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
                 else
                         error = count - buf.count;
         }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
  
@@ -278,16 +275,15 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
         struct file * file;
         struct linux_dirent64 __user * lastdirent;
         struct getdents_callback64 buf;
+       int fput_needed;
         int error;
  
-       error = -EFAULT;
         if (!access_ok(VERIFY_WRITE, dirent, count))
-               goto out;
+               return -EFAULT;
  
-       error = -EBADF;
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (!file)
-               goto out;
+               return -EBADF;
  
         buf.current_dir = dirent;
         buf.previous = NULL;
@@ -305,7 +301,6 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd,
                 else
                         error = count - buf.count;
         }
-       fput(file);
-out:
+       fput_light(file, fput_needed);
         return error;
  }
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c

index 59d06871a850dcebc966581f43c656bedba440f0..a6d4268fb6c11798db5f8339bd14ab297cd9b21f 100644 (file)
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1592,13 +1592,12 @@ struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
                 (fh_type == 6) ? fid->raw[5] : 0);
  }
  
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
-                      int need_parent)
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+                      struct inode *parent)
  {
-       struct inode *inode = dentry->d_inode;
         int maxlen = *lenp;
  
-       if (need_parent && (maxlen < 5)) {
+       if (parent && (maxlen < 5)) {
                 *lenp = 5;
                 return 255;
         } else if (maxlen < 3) {
@@ -1610,20 +1609,15 @@ int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
         data[1] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
         data[2] = inode->i_generation;
         *lenp = 3;
-       /* no room for directory info? return what we've stored so far */
-       if (maxlen < 5 || !need_parent)
-               return 3;
-
-       spin_lock(&dentry->d_lock);
-       inode = dentry->d_parent->d_inode;
-       data[3] = inode->i_ino;
-       data[4] = le32_to_cpu(INODE_PKEY(inode)->k_dir_id);
-       *lenp = 5;
-       if (maxlen >= 6) {
-               data[5] = inode->i_generation;
-               *lenp = 6;
-       }
-       spin_unlock(&dentry->d_lock);
+       if (parent) {
+               data[3] = parent->i_ino;
+               data[4] = le32_to_cpu(INODE_PKEY(parent)->k_dir_id);
+               *lenp = 5;
+               if (maxlen >= 6) {
+                       data[5] = parent->i_generation;
+                       *lenp = 6;
+               }
+       }
         return *lenp;
  }
  
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c

index b1a08573fe14277961aa3039ce0fb587d4f889cc..afcadcc03e8ac87c7f25f3e2393b3c108daaf91d 100644 (file)
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -1923,6 +1923,8 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
          * the workqueue job (flush_async_commit) needs this lock
          */
         reiserfs_write_unlock(sb);
+
+       cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work);
         flush_workqueue(commit_wq);
  
         if (!reiserfs_mounted_fs_count) {
@@ -3231,8 +3233,6 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
                                th->t_trans_id, journal->j_trans_id);
         }
  
-       sb->s_dirt = 1;
-
         prepared = test_clear_buffer_journal_prepared(bh);
         clear_buffer_journal_restore_dirty(bh);
         /* already in this transaction, we are done */
@@ -3316,6 +3316,7 @@ int journal_mark_dirty(struct reiserfs_transaction_handle *th,
                 journal->j_first = cn;
                 journal->j_last = cn;
         }
+       reiserfs_schedule_old_flush(sb);
         return 0;
  }
  
@@ -3492,7 +3493,7 @@ static void flush_async_commits(struct work_struct *work)
  ** flushes any old transactions to disk
  ** ends the current transaction if it is too old
  */
-int reiserfs_flush_old_commits(struct super_block *sb)
+void reiserfs_flush_old_commits(struct super_block *sb)
  {
         time_t now;
         struct reiserfs_transaction_handle th;
@@ -3502,9 +3503,8 @@ int reiserfs_flush_old_commits(struct super_block *sb)
         /* safety check so we don't flush while we are replaying the log during
          * mount
          */
-       if (list_empty(&journal->j_journal_list)) {
-               return 0;
-       }
+       if (list_empty(&journal->j_journal_list))
+               return;
  
         /* check the current transaction.  If there are no writers, and it is
          * too old, finish it, and force the commit blocks to disk
@@ -3526,7 +3526,6 @@ int reiserfs_flush_old_commits(struct super_block *sb)
                         do_journal_end(&th, sb, 1, COMMIT_NOW | WAIT);
                 }
         }
-       return sb->s_dirt;
  }
  
  /*
@@ -3955,7 +3954,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
          ** it tells us if we should continue with the journal_end, or just return
          */
         if (!check_journal_end(th, sb, nblocks, flags)) {
-               sb->s_dirt = 1;
+               reiserfs_schedule_old_flush(sb);
                 wake_queued_writers(sb);
                 reiserfs_async_progress_wait(sb);
                 goto out;
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h

index a59d27126338e43939f8fc04942acf13c956f1e4..33215f57ea06ce3026ef2d488832a337a361bd71 100644 (file)
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -480,6 +480,11 @@ struct reiserfs_sb_info {
         struct dentry *priv_root;       /* root of /.reiserfs_priv */
         struct dentry *xattr_root;      /* root of /.reiserfs_priv/xattrs */
         int j_errno;
+
+       int work_queued;              /* non-zero delayed work is queued */
+       struct delayed_work old_work; /* old transactions flush delayed work */
+       spinlock_t old_work_lock;     /* protects old_work and work_queued */
+
  #ifdef CONFIG_QUOTA
         char *s_qf_names[MAXQUOTAS];
         int s_jquota_fmt;
@@ -2452,7 +2457,7 @@ struct reiserfs_transaction_handle *reiserfs_persistent_transaction(struct
  int reiserfs_end_persistent_transaction(struct reiserfs_transaction_handle *);
  int reiserfs_commit_page(struct inode *inode, struct page *page,
                          unsigned from, unsigned to);
-int reiserfs_flush_old_commits(struct super_block *);
+void reiserfs_flush_old_commits(struct super_block *);
  int reiserfs_commit_for_inode(struct inode *);
  int reiserfs_inode_needs_commit(struct inode *);
  void reiserfs_update_inode_transaction(struct inode *);
@@ -2487,6 +2492,7 @@ void reiserfs_abort(struct super_block *sb, int errno, const char *fmt, ...);
  int reiserfs_allocate_list_bitmaps(struct super_block *s,
                                    struct reiserfs_list_bitmap *, unsigned int);
  
+void reiserfs_schedule_old_flush(struct super_block *s);
  void add_save_link(struct reiserfs_transaction_handle *th,
                    struct inode *inode, int truncate);
  int remove_save_link(struct inode *inode, int truncate);
@@ -2611,8 +2617,8 @@ struct dentry *reiserfs_fh_to_dentry(struct super_block *sb, struct fid *fid,
                                      int fh_len, int fh_type);
  struct dentry *reiserfs_fh_to_parent(struct super_block *sb, struct fid *fid,
                                      int fh_len, int fh_type);
-int reiserfs_encode_fh(struct dentry *dentry, __u32 * data, int *lenp,
-                      int connectable);
+int reiserfs_encode_fh(struct inode *inode, __u32 * data, int *lenp,
+                      struct inode *parent);
  
  int reiserfs_truncate_file(struct inode *, int update_timestamps);
  void make_cpu_key(struct cpu_key *cpu_key, struct inode *inode, loff_t offset,
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c

index 9a17f63c3fd7f3618a44bdf946476e4957dcd50d..3ce02cff5e90bd1c26374e12e15a6f56ea8c8803 100644 (file)
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -200,7 +200,6 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
                                           (bmap_nr_new - bmap_nr)));
         PUT_SB_BLOCK_COUNT(s, block_count_new);
         PUT_SB_BMAP_NR(s, bmap_would_wrap(bmap_nr_new) ? : bmap_nr_new);
-       s->s_dirt = 1;
  
         journal_mark_dirty(&th, s, SB_BUFFER_WITH_SB(s));
  
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c

index c07b7d709447de1670e9caf8a7fe9bbca6593138..651ce767b55d8241e283b3001d7fc9e6d803b317 100644 (file)
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -72,20 +72,58 @@ static int reiserfs_sync_fs(struct super_block *s, int wait)
         if (!journal_begin(&th, s, 1))
                 if (!journal_end_sync(&th, s, 1))
                         reiserfs_flush_old_commits(s);
-       s->s_dirt = 0;  /* Even if it's not true.
-                        * We'll loop forever in sync_supers otherwise */
         reiserfs_write_unlock(s);
         return 0;
  }
  
-static void reiserfs_write_super(struct super_block *s)
+static void flush_old_commits(struct work_struct *work)
  {
+       struct reiserfs_sb_info *sbi;
+       struct super_block *s;
+
+       sbi = container_of(work, struct reiserfs_sb_info, old_work.work);
+       s = sbi->s_journal->j_work_sb;
+
+       spin_lock(&sbi->old_work_lock);
+       sbi->work_queued = 0;
+       spin_unlock(&sbi->old_work_lock);
+
         reiserfs_sync_fs(s, 1);
  }
  
+void reiserfs_schedule_old_flush(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+       unsigned long delay;
+
+       if (s->s_flags & MS_RDONLY)
+               return;
+
+       spin_lock(&sbi->old_work_lock);
+       if (!sbi->work_queued) {
+               delay = msecs_to_jiffies(dirty_writeback_interval * 10);
+               queue_delayed_work(system_long_wq, &sbi->old_work, delay);
+               sbi->work_queued = 1;
+       }
+       spin_unlock(&sbi->old_work_lock);
+}
+
+static void cancel_old_flush(struct super_block *s)
+{
+       struct reiserfs_sb_info *sbi = REISERFS_SB(s);
+
+       cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+       spin_lock(&sbi->old_work_lock);
+       sbi->work_queued = 0;
+       spin_unlock(&sbi->old_work_lock);
+}
+
  static int reiserfs_freeze(struct super_block *s)
  {
         struct reiserfs_transaction_handle th;
+
+       cancel_old_flush(s);
+
         reiserfs_write_lock(s);
         if (!(s->s_flags & MS_RDONLY)) {
                 int err = journal_begin(&th, s, 1);
@@ -99,7 +137,6 @@ static int reiserfs_freeze(struct super_block *s)
                         journal_end_sync(&th, s, 1);
                 }
         }
-       s->s_dirt = 0;
         reiserfs_write_unlock(s);
         return 0;
  }
@@ -483,9 +520,6 @@ static void reiserfs_put_super(struct super_block *s)
  
         reiserfs_write_lock(s);
  
-       if (s->s_dirt)
-               reiserfs_write_super(s);
-
         /* change file system state to current state if it was mounted with read-write permissions */
         if (!(s->s_flags & MS_RDONLY)) {
                 if (!journal_begin(&th, s, 10)) {
@@ -692,7 +726,6 @@ static const struct super_operations reiserfs_sops = {
         .dirty_inode = reiserfs_dirty_inode,
         .evict_inode = reiserfs_evict_inode,
         .put_super = reiserfs_put_super,
-       .write_super = reiserfs_write_super,
         .sync_fs = reiserfs_sync_fs,
         .freeze_fs = reiserfs_freeze,
         .unfreeze_fs = reiserfs_unfreeze,
@@ -1400,7 +1433,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
         err = journal_end(&th, s, 10);
         if (err)
                 goto out_err;
-       s->s_dirt = 0;
  
         if (!(*mount_flags & MS_RDONLY)) {
                 dquot_resume(s, -1);
@@ -1730,19 +1762,21 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                 return -ENOMEM;
         s->s_fs_info = sbi;
         /* Set default values for options: non-aggressive tails, RO on errors */
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_ERROR_RO);
-       REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
+       sbi->s_mount_opt |= (1 << REISERFS_SMALLTAIL);
+       sbi->s_mount_opt |= (1 << REISERFS_ERROR_RO);
+       sbi->s_mount_opt |= (1 << REISERFS_BARRIER_FLUSH);
         /* no preallocation minimum, be smart in
            reiserfs_file_write instead */
-       REISERFS_SB(s)->s_alloc_options.preallocmin = 0;
+       sbi->s_alloc_options.preallocmin = 0;
         /* Preallocate by 16 blocks (17-1) at once */
-       REISERFS_SB(s)->s_alloc_options.preallocsize = 17;
+       sbi->s_alloc_options.preallocsize = 17;
         /* setup default block allocator options */
         reiserfs_init_alloc_options(s);
  
-       mutex_init(&REISERFS_SB(s)->lock);
-       REISERFS_SB(s)->lock_depth = -1;
+       spin_lock_init(&sbi->old_work_lock);
+       INIT_DELAYED_WORK(&sbi->old_work, flush_old_commits);
+       mutex_init(&sbi->lock);
+       sbi->lock_depth = -1;
  
         jdev_name = NULL;
         if (reiserfs_parse_options
@@ -1751,8 +1785,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
                 goto error_unlocked;
         }
         if (jdev_name && jdev_name[0]) {
-               REISERFS_SB(s)->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
-               if (!REISERFS_SB(s)->s_jdev) {
+               sbi->s_jdev = kstrdup(jdev_name, GFP_KERNEL);
+               if (!sbi->s_jdev) {
                         SWARN(silent, s, "", "Cannot allocate memory for "
                                 "journal device name");
                         goto error;
@@ -1810,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
         /* make data=ordered the default */
         if (!reiserfs_data_log(s) && !reiserfs_data_ordered(s) &&
             !reiserfs_data_writeback(s)) {
-               REISERFS_SB(s)->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
+               sbi->s_mount_opt |= (1 << REISERFS_DATA_ORDERED);
         }
  
         if (reiserfs_data_log(s)) {
@@ -2003,6 +2037,8 @@ error_unlocked:
                 reiserfs_write_unlock(s);
         }
  
+       cancel_delayed_work_sync(&REISERFS_SB(s)->old_work);
+
         reiserfs_free_bitmap_cache(s);
         if (SB_BUFFER_WITH_SB(s))
                 brelse(SB_BUFFER_WITH_SB(s));
diff --git a/fs/signalfd.c b/fs/signalfd.c

index 7ae2a574cb25a64902128f53832b317202dbee8f..9f35a37173de0de1f7fbd8d80ca8ad39b50e3782 100644 (file)
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -269,12 +269,13 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                 if (ufd < 0)
                         kfree(ctx);
         } else {
-               struct file *file = fget(ufd);
+               int fput_needed;
+               struct file *file = fget_light(ufd, &fput_needed);
                 if (!file)
                         return -EBADF;
                 ctx = file->private_data;
                 if (file->f_op != &signalfd_fops) {
-                       fput(file);
+                       fput_light(file, fput_needed);
                         return -EINVAL;
                 }
                 spin_lock_irq(&current->sighand->siglock);
@@ -282,7 +283,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask,
                 spin_unlock_irq(&current->sighand->siglock);
  
                 wake_up(&current->sighand->signalfd_wqh);
-               fput(file);
+               fput_light(file, fput_needed);
         }
  
         return ufd;
diff --git a/fs/splice.c b/fs/splice.c

index 406ef2b792c293d709aa164481d20f7ee37b2ed4..c9f1318a3b820b363526576036c4894205552921 100644 (file)
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1003,8 +1003,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
                 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
                 ret = file_remove_suid(out);
                 if (!ret) {
-                       file_update_time(out);
-                       ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+                       ret = file_update_time(out);
+                       if (!ret)
+                               ret = splice_from_pipe_feed(pipe, &sd,
+                                                           pipe_to_file);
                 }
                 mutex_unlock(&inode->i_mutex);
         } while (ret > 0);
diff --git a/fs/statfs.c b/fs/statfs.c

index 43e6b6fe4e855684a197c48ed6bb8dee70f95467..95ad5c0e586c9f64fe492e141387b5092956d553 100644 (file)
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -87,11 +87,12 @@ int user_statfs(const char __user *pathname, struct kstatfs *st)
  
  int fd_statfs(int fd, struct kstatfs *st)
  {
-       struct file *file = fget(fd);
+       int fput_needed;
+       struct file *file = fget_light(fd, &fput_needed);
         int error = -EBADF;
         if (file) {
                 error = vfs_statfs(&file->f_path, st);
-               fput(file);
+               fput_light(file, fput_needed);
         }
         return error;
  }
diff --git a/fs/sync.c b/fs/sync.c

index 0e8db939d96f8fdaa072df7e6fcadb15a5781a84..11e3d1c449018dcf9a95c352746f46d6522c4cb2 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -188,11 +188,12 @@ static int do_fsync(unsigned int fd, int datasync)
  {
         struct file *file;
         int ret = -EBADF;
+       int fput_needed;
  
-       file = fget(fd);
+       file = fget_light(fd, &fput_needed);
         if (file) {
                 ret = vfs_fsync(file, datasync);
-               fput(file);
+               fput_light(file, fput_needed);
         }
         return ret;
  }
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c

index 62a2727f4ecf71809f518dd206922fc7f9234f0d..a6d42efc76d227d62289f852982160442d7e5cea 100644 (file)
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1127,16 +1127,7 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
         struct ubifs_inode *ui = ubifs_inode(inode);
  
         mutex_lock(&ui->ui_mutex);
-       stat->dev = inode->i_sb->s_dev;
-       stat->ino = inode->i_ino;
-       stat->mode = inode->i_mode;
-       stat->nlink = inode->i_nlink;
-       stat->uid = inode->i_uid;
-       stat->gid = inode->i_gid;
-       stat->rdev = inode->i_rdev;
-       stat->atime = inode->i_atime;
-       stat->mtime = inode->i_mtime;
-       stat->ctime = inode->i_ctime;
+       generic_fillattr(inode, stat);
         stat->blksize = UBIFS_BLOCK_SIZE;
         stat->size = ui->ui_size;
  
diff --git a/fs/udf/namei.c b/fs/udf/namei.c

index a165c66e3eef2249379890c60a4c7d4111e8df4d..18024178ac4c040a3f23181ff2dc1a5cc48f2dc8 100644 (file)
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -1260,16 +1260,15 @@ static struct dentry *udf_fh_to_parent(struct super_block *sb,
                                  fid->udf.parent_partref,
                                  fid->udf.parent_generation);
  }
-static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
-                        int connectable)
+static int udf_encode_fh(struct inode *inode, __u32 *fh, int *lenp,
+                        struct inode *parent)
  {
         int len = *lenp;
-       struct inode *inode =  de->d_inode;
         struct kernel_lb_addr location = UDF_I(inode)->i_location;
         struct fid *fid = (struct fid *)fh;
         int type = FILEID_UDF_WITHOUT_PARENT;
  
-       if (connectable && (len < 5)) {
+       if (parent && (len < 5)) {
                 *lenp = 5;
                 return 255;
         } else if (len < 3) {
@@ -1282,14 +1281,11 @@ static int udf_encode_fh(struct dentry *de, __u32 *fh, int *lenp,
         fid->udf.partref = location.partitionReferenceNum;
         fid->udf.generation = inode->i_generation;
  
-       if (connectable && !S_ISDIR(inode->i_mode)) {
-               spin_lock(&de->d_lock);
-               inode = de->d_parent->d_inode;
-               location = UDF_I(inode)->i_location;
+       if (parent) {
+               location = UDF_I(parent)->i_location;
                 fid->udf.parent_block = location.logicalBlockNum;
                 fid->udf.parent_partref = location.partitionReferenceNum;
                 fid->udf.parent_generation = inode->i_generation;
-               spin_unlock(&de->d_lock);
                 *lenp = 5;
                 type = FILEID_UDF_WITH_PARENT;
         }
diff --git a/fs/utimes.c b/fs/utimes.c

index ba653f3dc1bc9c66010290e53e0bb2a5b8fd94b1..fa4dbe451e278eab0f52bbacc110b157a300cdad 100644 (file)
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -140,18 +140,19 @@ long do_utimes(int dfd, const char __user *filename, struct timespec *times,
                 goto out;
  
         if (filename == NULL && dfd != AT_FDCWD) {
+               int fput_needed;
                 struct file *file;
  
                 if (flags & AT_SYMLINK_NOFOLLOW)
                         goto out;
  
-               file = fget(dfd);
+               file = fget_light(dfd, &fput_needed);
                 error = -EBADF;
                 if (!file)
                         goto out;
  
                 error = utimes_common(&file->f_path, times);
-               fput(file);
+               fput_light(file, fput_needed);
         } else {
                 struct path path;
                 int lookup_flags = 0;
diff --git a/fs/xattr.c b/fs/xattr.c

index 3c8c1cc333c7c79dfa105049a62d8b6e1c28661c..1d7ac379045879b827b196f0d7a7420fe33c783d 100644 (file)
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -399,11 +399,12 @@ SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname,
  SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                 const void __user *,value, size_t, size, int, flags)
  {
+       int fput_needed;
         struct file *f;
         struct dentry *dentry;
         int error = -EBADF;
  
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
         if (!f)
                 return error;
         dentry = f->f_path.dentry;
@@ -413,7 +414,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
                 error = setxattr(dentry, name, value, size, flags);
                 mnt_drop_write_file(f);
         }
-       fput(f);
+       fput_light(f, fput_needed);
         return error;
  }
  
@@ -486,15 +487,16 @@ SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname,
  SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name,
                 void __user *, value, size_t, size)
  {
+       int fput_needed;
         struct file *f;
         ssize_t error = -EBADF;
  
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
         if (!f)
                 return error;
         audit_inode(NULL, f->f_path.dentry);
         error = getxattr(f->f_path.dentry, name, value, size);
-       fput(f);
+       fput_light(f, fput_needed);
         return error;
  }
  
@@ -566,15 +568,16 @@ SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list,
  
  SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size)
  {
+       int fput_needed;
         struct file *f;
         ssize_t error = -EBADF;
  
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
         if (!f)
                 return error;
         audit_inode(NULL, f->f_path.dentry);
         error = listxattr(f->f_path.dentry, list, size);
-       fput(f);
+       fput_light(f, fput_needed);
         return error;
  }
  
@@ -634,11 +637,12 @@ SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname,
  
  SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
  {
+       int fput_needed;
         struct file *f;
         struct dentry *dentry;
         int error = -EBADF;
  
-       f = fget(fd);
+       f = fget_light(fd, &fput_needed);
         if (!f)
                 return error;
         dentry = f->f_path.dentry;
@@ -648,7 +652,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
                 error = removexattr(dentry, name);
                 mnt_drop_write_file(f);
         }
-       fput(f);
+       fput_light(f, fput_needed);
         return error;
  }
  
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c

index a907de565db3bf287f7d1a7894fff23f85ca18d5..4a7286c1dc80d270af40a3733870bb9dd769ee82 100644 (file)
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -46,7 +46,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
  }
  
  void *
-kmem_alloc(size_t size, unsigned int __nocast flags)
+kmem_alloc(size_t size, xfs_km_flags_t flags)
  {
         int     retries = 0;
         gfp_t   lflags = kmem_flags_convert(flags);
@@ -65,7 +65,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
  }
  
  void *
-kmem_zalloc(size_t size, unsigned int __nocast flags)
+kmem_zalloc(size_t size, xfs_km_flags_t flags)
  {
         void    *ptr;
  
@@ -87,7 +87,7 @@ kmem_free(const void *ptr)
  
  void *
  kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
-            unsigned int __nocast flags)
+            xfs_km_flags_t flags)
  {
         void    *new;
  
@@ -102,7 +102,7 @@ kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
  }
  
  void *
-kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_alloc(kmem_zone_t *zone, xfs_km_flags_t flags)
  {
         int     retries = 0;
         gfp_t   lflags = kmem_flags_convert(flags);
@@ -121,7 +121,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
  }
  
  void *
-kmem_zone_zalloc(kmem_zone_t *zone, unsigned int __nocast flags)
+kmem_zone_zalloc(kmem_zone_t *zone, xfs_km_flags_t flags)
  {
         void    *ptr;
  
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h

index ab7c53fe346e2273311a1b73bee866ab8c4f8d7f..b2f2620f9a87b9f1bf6836c8faf3d5039e7af94f 100644 (file)
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -27,10 +27,11 @@
   * General memory allocation interfaces
   */
  
-#define KM_SLEEP       0x0001u
-#define KM_NOSLEEP     0x0002u
-#define KM_NOFS                0x0004u
-#define KM_MAYFAIL     0x0008u
+typedef unsigned __bitwise xfs_km_flags_t;
+#define KM_SLEEP       ((__force xfs_km_flags_t)0x0001u)
+#define KM_NOSLEEP     ((__force xfs_km_flags_t)0x0002u)
+#define KM_NOFS                ((__force xfs_km_flags_t)0x0004u)
+#define KM_MAYFAIL     ((__force xfs_km_flags_t)0x0008u)
  
  /*
   * We use a special process flag to avoid recursive callbacks into
@@ -38,7 +39,7 @@
   * warnings, so we explicitly skip any generic ones (silly of us).
   */
  static inline gfp_t
-kmem_flags_convert(unsigned int __nocast flags)
+kmem_flags_convert(xfs_km_flags_t flags)
  {
         gfp_t   lflags;
  
@@ -54,9 +55,9 @@ kmem_flags_convert(unsigned int __nocast flags)
         return lflags;
  }
  
-extern void *kmem_alloc(size_t, unsigned int __nocast);
-extern void *kmem_zalloc(size_t, unsigned int __nocast);
-extern void *kmem_realloc(const void *, size_t, size_t, unsigned int __nocast);
+extern void *kmem_alloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
  extern void  kmem_free(const void *);
  
  static inline void *kmem_zalloc_large(size_t size)
@@ -107,7 +108,7 @@ kmem_zone_destroy(kmem_zone_t *zone)
                 kmem_cache_destroy(zone);
  }
  
-extern void *kmem_zone_alloc(kmem_zone_t *, unsigned int __nocast);
-extern void *kmem_zone_zalloc(kmem_zone_t *, unsigned int __nocast);
+extern void *kmem_zone_alloc(kmem_zone_t *, xfs_km_flags_t);
+extern void *kmem_zone_zalloc(kmem_zone_t *, xfs_km_flags_t);
  
  #endif /* __XFS_SUPPORT_KMEM_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c

index 2d25d19c4ea17b991fa4a43dcbb340e5c957c461..42679223a0fde641e3013980fbd1e733dc6ec60e 100644 (file)
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -52,19 +52,18 @@ static int xfs_fileid_length(int fileid_type)
  
  STATIC int
  xfs_fs_encode_fh(
-       struct dentry           *dentry,
-       __u32                   *fh,
-       int                     *max_len,
-       int                     connectable)
+       struct inode    *inode,
+       __u32           *fh,
+       int             *max_len,
+       struct inode    *parent)
  {
         struct fid              *fid = (struct fid *)fh;
         struct xfs_fid64        *fid64 = (struct xfs_fid64 *)fh;
-       struct inode            *inode = dentry->d_inode;
         int                     fileid_type;
         int                     len;
  
         /* Directories don't need their parent encoded, they have ".." */
-       if (S_ISDIR(inode->i_mode) || !connectable)
+       if (!parent)
                 fileid_type = FILEID_INO32_GEN;
         else
                 fileid_type = FILEID_INO32_GEN_PARENT;
@@ -96,20 +95,16 @@ xfs_fs_encode_fh(
  
         switch (fileid_type) {
         case FILEID_INO32_GEN_PARENT:
-               spin_lock(&dentry->d_lock);
-               fid->i32.parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
-               fid->i32.parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
+               fid->i32.parent_ino = XFS_I(parent)->i_ino;
+               fid->i32.parent_gen = parent->i_generation;
                 /*FALLTHRU*/
         case FILEID_INO32_GEN:
                 fid->i32.ino = XFS_I(inode)->i_ino;
                 fid->i32.gen = inode->i_generation;
                 break;
         case FILEID_INO32_GEN_PARENT | XFS_FILEID_TYPE_64FLAG:
-               spin_lock(&dentry->d_lock);
-               fid64->parent_ino = XFS_I(dentry->d_parent->d_inode)->i_ino;
-               fid64->parent_gen = dentry->d_parent->d_inode->i_generation;
-               spin_unlock(&dentry->d_lock);
+               fid64->parent_ino = XFS_I(parent)->i_ino;
+               fid64->parent_gen = parent->i_generation;
                 /*FALLTHRU*/
         case FILEID_INO32_GEN | XFS_FILEID_TYPE_64FLAG:
                 fid64->ino = XFS_I(inode)->i_ino;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c

index 8d214b87f6bb06ed1f7ed204cdfda8da9345172f..9f7ec15a65222e2fe318e0ab81ac9cca0a664b4a 100644 (file)
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -586,8 +586,11 @@ restart:
          * lock above.  Eventually we should look into a way to avoid
          * the pointless lock roundtrip.
          */
-       if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-               file_update_time(file);
+       if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+               error = file_update_time(file);
+               if (error)
+                       return error;
+       }
  
         /*
          * If we're writing the file then make sure to clear the setuid and
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c

index 6b965bf450e44d5972fc689d486deec3dd5c8094..f30d9807dc48a0535084da1afd5a7620389adcd5 100644 (file)
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -3152,7 +3152,7 @@ xlog_ticket_alloc(
         int             cnt,
         char            client,
         bool            permanent,
-       int             alloc_flags)
+       xfs_km_flags_t  alloc_flags)
  {
         struct xlog_ticket *tic;
         uint            num_headers;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h

index 735ff1ee53da447eee9c5b88d54e9007ee988138..5bc33261f5be6311fb5732f42db25e17064abded 100644 (file)
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -555,7 +555,7 @@ extern void  xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
  extern kmem_zone_t *xfs_log_ticket_zone;
  struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
                                 int count, char client, bool permanent,
-                               int alloc_flags);
+                               xfs_km_flags_t alloc_flags);
  
  
  static inline void
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c

index cdf896fcbfa43810c83bfbc7a99f84f118a4d75d..fdf324508c5ee467c6055f0866e1be88c387942d 100644 (file)
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -584,7 +584,7 @@ xfs_trans_t *
  _xfs_trans_alloc(
         xfs_mount_t     *mp,
         uint            type,
-       uint            memflags)
+       xfs_km_flags_t  memflags)
  {
         xfs_trans_t     *tp;
  
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h

index 7ab99e1898c8de10e875aff23599d140c9867c4b..7c37b533aa8e5c169f0ef98643f96958df3788df 100644 (file)
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -443,7 +443,7 @@ typedef struct xfs_trans {
   * XFS transaction mechanism exported interfaces.
   */
  xfs_trans_t    *xfs_trans_alloc(struct xfs_mount *, uint);
-xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, uint);
+xfs_trans_t    *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t);
  xfs_trans_t    *xfs_trans_dup(xfs_trans_t *);
  int            xfs_trans_reserve(xfs_trans_t *, uint, uint, uint,
                                   uint, uint);
diff --git a/include/asm-generic/posix_types.h b/include/asm-generic/posix_types.h

index 91d44bd4dde32574bb6365a5526ac33a19992050..fe74fccf18db75742d240151ec358049861b7406 100644 (file)
--- a/include/asm-generic/posix_types.h
+++ b/include/asm-generic/posix_types.h
@@ -23,10 +23,6 @@ typedef __kernel_ulong_t __kernel_ino_t;
  typedef unsigned int   __kernel_mode_t;
  #endif
  
-#ifndef __kernel_nlink_t
-typedef __kernel_ulong_t __kernel_nlink_t;
-#endif
-
  #ifndef __kernel_pid_t
  typedef int            __kernel_pid_t;
  #endif
diff --git a/include/linux/errno.h b/include/linux/errno.h

index 2d09bfa5c2628a3e1e396350b7d9d14f1a2791c8..e0de516374da37de6a95c35e79ab7cabb899d177 100644 (file)
--- a/include/linux/errno.h
+++ b/include/linux/errno.h
@@ -17,6 +17,7 @@
  #define ENOIOCTLCMD    515     /* No ioctl command */
  #define ERESTART_RESTARTBLOCK 516 /* restart by calling sys_restart_syscall */
  #define EPROBE_DEFER   517     /* Driver requests probe retry */
+#define EOPENSTALE     518     /* open found a stale dentry */
  
  /* Defined for the NFSv3 protocol */
  #define EBADHANDLE     521     /* Illegal NFS file handle */
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h

index 3a4cef5322dcab4d6b50b96243fa7187b2da1ebd..12291a7ee2759164026ac602ab1712b5d66faef0 100644 (file)
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -165,8 +165,8 @@ struct fid {
   */
  
  struct export_operations {
-       int (*encode_fh)(struct dentry *de, __u32 *fh, int *max_len,
-                       int connectable);
+       int (*encode_fh)(struct inode *inode, __u32 *fh, int *max_len,
+                       struct inode *parent);
         struct dentry * (*fh_to_dentry)(struct super_block *sb, struct fid *fid,
                         int fh_len, int fh_type);
         struct dentry * (*fh_to_parent)(struct super_block *sb, struct fid *fid,
diff --git a/include/linux/fs.h b/include/linux/fs.h

index 40887afaaca7008c3becca920313ffbc555a9fd5..51978ed43e973ccf70996c19c33820fd39edeabe 100644 (file)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1692,6 +1692,7 @@ struct inode_operations {
         int (*removexattr) (struct dentry *, const char *);
         int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                       u64 len);
+       int (*update_time)(struct inode *, struct timespec *, int);
  } ____cacheline_aligned;
  
  struct seq_file;
@@ -1850,6 +1851,13 @@ static inline void inode_inc_iversion(struct inode *inode)
         spin_unlock(&inode->i_lock);
  }
  
+enum file_time_flags {
+       S_ATIME = 1,
+       S_MTIME = 2,
+       S_CTIME = 4,
+       S_VERSION = 8,
+};
+
  extern void touch_atime(struct path *);
  static inline void file_accessed(struct file *file)
  {
@@ -2583,7 +2591,7 @@ extern int inode_change_ok(const struct inode *, struct iattr *);
  extern int inode_newsize_ok(const struct inode *, loff_t offset);
  extern void setattr_copy(struct inode *inode, const struct iattr *attr);
  
-extern void file_update_time(struct file *file);
+extern int file_update_time(struct file *file);
  
  extern int generic_show_options(struct seq_file *m, struct dentry *root);
  extern void save_mount_options(struct super_block *sb, char *options);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h

index 91d0e0a34ef3185a6051d8394cab63dfb76a04cb..63d966d5c2ea7a382c2f42cc664c7804dec86f73 100644 (file)
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -60,7 +60,7 @@
  #define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
                                    FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
                                    FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
-                                  FS_DELETE)
+                                  FS_DELETE | FS_OPEN_PERM | FS_ACCESS_PERM)
  
  #define FS_MOVE                        (FS_MOVED_FROM | FS_MOVED_TO)
  
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h

index 912c30a8ddb1e47cd732fbd95f281238ca601ae0..f334c7fab96762ab4131c9886df87d4d6d4dde9d 100644 (file)
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -31,6 +31,7 @@
  #include <linux/mutex.h>
  #include <linux/timer.h>
  #include <linux/slab.h>
+#include <crypto/hash.h>
  #endif
  
  #define journal_oom_retry 1
@@ -147,12 +148,24 @@ typedef struct journal_header_s
  #define JBD2_CRC32_CHKSUM   1
  #define JBD2_MD5_CHKSUM     2
  #define JBD2_SHA1_CHKSUM    3
+#define JBD2_CRC32C_CHKSUM  4
  
  #define JBD2_CRC32_CHKSUM_SIZE 4
  
  #define JBD2_CHECKSUM_BYTES (32 / sizeof(u32))
  /*
   * Commit block header for storing transactional checksums:
+ *
+ * NOTE: If FEATURE_COMPAT_CHECKSUM (checksum v1) is set, the h_chksum*
+ * fields are used to store a checksum of the descriptor and data blocks.
+ *
+ * If FEATURE_INCOMPAT_CSUM_V2 (checksum v2) is set, then the h_chksum
+ * field is used to store crc32c(uuid+commit_block).  Each journal metadata
+ * block gets its own checksum, and data block checksums are stored in
+ * journal_block_tag (in the descriptor).  The other h_chksum* fields are
+ * not used.
+ *
+ * Checksum v1 and v2 are mutually exclusive features.
   */
  struct commit_header {
         __be32          h_magic;
@@ -175,13 +188,19 @@ struct commit_header {
  typedef struct journal_block_tag_s
  {
         __be32          t_blocknr;      /* The on-disk block number */
-       __be32          t_flags;        /* See below */
+       __be16          t_checksum;     /* truncated crc32c(uuid+seq+block) */
+       __be16          t_flags;        /* See below */
         __be32          t_blocknr_high; /* most-significant high 32bits. */
  } journal_block_tag_t;
  
  #define JBD2_TAG_SIZE32 (offsetof(journal_block_tag_t, t_blocknr_high))
  #define JBD2_TAG_SIZE64 (sizeof(journal_block_tag_t))
  
+/* Tail of descriptor block, for checksumming */
+struct jbd2_journal_block_tail {
+       __be32          t_checksum;     /* crc32c(uuid+descr_block) */
+};
+
  /*
   * The revoke descriptor: used on disk to describe a series of blocks to
   * be revoked from the log
@@ -192,6 +211,10 @@ typedef struct jbd2_journal_revoke_header_s
         __be32           r_count;       /* Count of bytes used in the block */
  } jbd2_journal_revoke_header_t;
  
+/* Tail of revoke block, for checksumming */
+struct jbd2_journal_revoke_tail {
+       __be32          r_checksum;     /* crc32c(uuid+revoke_block) */
+};
  
  /* Definitions for the journal tag flags word: */
  #define JBD2_FLAG_ESCAPE               1       /* on-disk block is escaped */
@@ -241,7 +264,10 @@ typedef struct journal_superblock_s
         __be32  s_max_trans_data;       /* Limit of data blocks per trans. */
  
  /* 0x0050 */
-       __u32   s_padding[44];
+       __u8    s_checksum_type;        /* checksum type */
+       __u8    s_padding2[3];
+       __u32   s_padding[42];
+       __be32  s_checksum;             /* crc32c(superblock) */
  
  /* 0x0100 */
         __u8    s_users[16*48];         /* ids of all fs'es sharing the log */
@@ -263,13 +289,15 @@ typedef struct journal_superblock_s
  #define JBD2_FEATURE_INCOMPAT_REVOKE           0x00000001
  #define JBD2_FEATURE_INCOMPAT_64BIT            0x00000002
  #define JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT     0x00000004
+#define JBD2_FEATURE_INCOMPAT_CSUM_V2          0x00000008
  
  /* Features known to this kernel version: */
  #define JBD2_KNOWN_COMPAT_FEATURES     JBD2_FEATURE_COMPAT_CHECKSUM
  #define JBD2_KNOWN_ROCOMPAT_FEATURES   0
  #define JBD2_KNOWN_INCOMPAT_FEATURES   (JBD2_FEATURE_INCOMPAT_REVOKE | \
                                         JBD2_FEATURE_INCOMPAT_64BIT | \
-                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)
+                                       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | \
+                                       JBD2_FEATURE_INCOMPAT_CSUM_V2)
  
  #ifdef __KERNEL__
  
@@ -939,6 +967,12 @@ struct journal_s
          * superblock pointer here
          */
         void *j_private;
+
+       /* Reference to checksum algorithm driver via cryptoapi */
+       struct crypto_shash *j_chksum_driver;
+
+       /* Precomputed journal UUID checksum for seeding other checksums */
+       __u32 j_csum_seed;
  };
  
  /*
@@ -1268,6 +1302,25 @@ static inline int jbd_space_needed(journal_t *journal)
  
  extern int jbd_blocks_per_page(struct inode *inode);
  
+static inline u32 jbd2_chksum(journal_t *journal, u32 crc,
+                             const void *address, unsigned int length)
+{
+       struct {
+               struct shash_desc shash;
+               char ctx[crypto_shash_descsize(journal->j_chksum_driver)];
+       } desc;
+       int err;
+
+       desc.shash.tfm = journal->j_chksum_driver;
+       desc.shash.flags = 0;
+       *(u32 *)desc.ctx = crc;
+
+       err = crypto_shash_update(&desc.shash, address, length);
+       BUG_ON(err);
+
+       return *(u32 *)desc.ctx;
+}
+
  #ifdef __KERNEL__
  
  #define buffer_trace_init(bh)  do {} while (0)
diff --git a/include/linux/jbd_common.h b/include/linux/jbd_common.h

index 6230f8556a4eeac37bcaaa83a4cc913177e09c56..6133679bc4c01ace20a0114fd50ff7c3481c7eb9 100644 (file)
--- a/include/linux/jbd_common.h
+++ b/include/linux/jbd_common.h
@@ -12,6 +12,7 @@ enum jbd_state_bits {
         BH_State,               /* Pins most journal_head state */
         BH_JournalHead,         /* Pins bh->b_private and jh->b_bh */
         BH_Unshadow,            /* Dummy bit, for BJ_Shadow wakeup filtering */
+       BH_Verified,            /* Metadata block has been verified ok */
         BH_JBDPrivateStart,     /* First bit available for private use by FS */
  };
  
@@ -24,6 +25,7 @@ TAS_BUFFER_FNS(Revoked, revoked)
  BUFFER_FNS(RevokeValid, revokevalid)
  TAS_BUFFER_FNS(RevokeValid, revokevalid)
  BUFFER_FNS(Freed, freed)
+BUFFER_FNS(Verified, verified)
  
  static inline struct buffer_head *jh2bh(struct journal_head *jh)
  {
diff --git a/include/linux/lglock.h b/include/linux/lglock.h

index 87f402ccec55567330943ab774ffb12ae21c7da8..f01e5f6d1f07a4966927bb7acd5707f8f77904c8 100644 (file)
--- a/include/linux/lglock.h
+++ b/include/linux/lglock.h
@@ -23,28 +23,17 @@
  #include <linux/lockdep.h>
  #include <linux/percpu.h>
  #include <linux/cpu.h>
+#include <linux/notifier.h>
  
  /* can make br locks by using local lock for read side, global lock for write */
-#define br_lock_init(name)     name##_lock_init()
-#define br_read_lock(name)     name##_local_lock()
-#define br_read_unlock(name)   name##_local_unlock()
-#define br_write_lock(name)    name##_global_lock_online()
-#define br_write_unlock(name)  name##_global_unlock_online()
+#define br_lock_init(name)     lg_lock_init(name, #name)
+#define br_read_lock(name)     lg_local_lock(name)
+#define br_read_unlock(name)   lg_local_unlock(name)
+#define br_write_lock(name)    lg_global_lock(name)
+#define br_write_unlock(name)  lg_global_unlock(name)
  
-#define DECLARE_BRLOCK(name)   DECLARE_LGLOCK(name)
  #define DEFINE_BRLOCK(name)    DEFINE_LGLOCK(name)
  
-
-#define lg_lock_init(name)     name##_lock_init()
-#define lg_local_lock(name)    name##_local_lock()
-#define lg_local_unlock(name)  name##_local_unlock()
-#define lg_local_lock_cpu(name, cpu)   name##_local_lock_cpu(cpu)
-#define lg_local_unlock_cpu(name, cpu) name##_local_unlock_cpu(cpu)
-#define lg_global_lock(name)   name##_global_lock()
-#define lg_global_unlock(name) name##_global_unlock()
-#define lg_global_lock_online(name) name##_global_lock_online()
-#define lg_global_unlock_online(name) name##_global_unlock_online()
-
  #ifdef CONFIG_DEBUG_LOCK_ALLOC
  #define LOCKDEP_INIT_MAP lockdep_init_map
  
@@ -59,142 +48,26 @@
  #define DEFINE_LGLOCK_LOCKDEP(name)
  #endif
  
-
-#define DECLARE_LGLOCK(name)                                           \
- extern void name##_lock_init(void);                                   \
- extern void name##_local_lock(void);                                  \
- extern void name##_local_unlock(void);                                        \
- extern void name##_local_lock_cpu(int cpu);                           \
- extern void name##_local_unlock_cpu(int cpu);                         \
- extern void name##_global_lock(void);                                 \
- extern void name##_global_unlock(void);                               \
- extern void name##_global_lock_online(void);                          \
- extern void name##_global_unlock_online(void);                                \
+struct lglock {
+       arch_spinlock_t __percpu *lock;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+       struct lock_class_key lock_key;
+       struct lockdep_map    lock_dep_map;
+#endif
+};
  
  #define DEFINE_LGLOCK(name)                                            \
-                                                                       \
- DEFINE_SPINLOCK(name##_cpu_lock);                                     \
- cpumask_t name##_cpus __read_mostly;                                  \
- DEFINE_PER_CPU(arch_spinlock_t, name##_lock);                         \
- DEFINE_LGLOCK_LOCKDEP(name);                                          \
-                                                                       \
- static int                                                            \
- name##_lg_cpu_callback(struct notifier_block *nb,                     \
-                               unsigned long action, void *hcpu)       \
- {                                                                     \
-       switch (action & ~CPU_TASKS_FROZEN) {                           \
-       case CPU_UP_PREPARE:                                            \
-               spin_lock(&name##_cpu_lock);                            \
-               cpu_set((unsigned long)hcpu, name##_cpus);              \
-               spin_unlock(&name##_cpu_lock);                          \
-               break;                                                  \
-       case CPU_UP_CANCELED: case CPU_DEAD:                            \
-               spin_lock(&name##_cpu_lock);                            \
-               cpu_clear((unsigned long)hcpu, name##_cpus);            \
-               spin_unlock(&name##_cpu_lock);                          \
-       }                                                               \
-       return NOTIFY_OK;                                               \
- }                                                                     \
- static struct notifier_block name##_lg_cpu_notifier = {               \
-       .notifier_call = name##_lg_cpu_callback,                        \
- };                                                                    \
- void name##_lock_init(void) {                                         \
-       int i;                                                          \
-       LOCKDEP_INIT_MAP(&name##_lock_dep_map, #name, &name##_lock_key, 0); \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               *lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;     \
-       }                                                               \
-       register_hotcpu_notifier(&name##_lg_cpu_notifier);              \
-       get_online_cpus();                                              \
-       for_each_online_cpu(i)                                          \
-               cpu_set(i, name##_cpus);                                \
-       put_online_cpus();                                              \
- }                                                                     \
- EXPORT_SYMBOL(name##_lock_init);                                      \
-                                                                       \
- void name##_local_lock(void) {                                                \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock);                                     \
-                                                                       \
- void name##_local_unlock(void) {                                      \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &__get_cpu_var(name##_lock);                             \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock);                                   \
-                                                                       \
- void name##_local_lock_cpu(int cpu) {                                 \
-       arch_spinlock_t *lock;                                          \
-       preempt_disable();                                              \
-       rwlock_acquire_read(&name##_lock_dep_map, 0, 0, _THIS_IP_);     \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_lock(lock);                                           \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_lock_cpu);                                 \
-                                                                       \
- void name##_local_unlock_cpu(int cpu) {                               \
-       arch_spinlock_t *lock;                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _THIS_IP_);             \
-       lock = &per_cpu(name##_lock, cpu);                              \
-       arch_spin_unlock(lock);                                         \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_local_unlock_cpu);                               \
-                                                                       \
- void name##_global_lock_online(void) {                                        \
-       int i;                                                          \
-       spin_lock(&name##_cpu_lock);                                    \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_cpu(i, &name##_cpus) {                                 \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock_online);                             \
-                                                                       \
- void name##_global_unlock_online(void) {                              \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_cpu(i, &name##_cpus) {                                 \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       spin_unlock(&name##_cpu_lock);                                  \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock_online);                           \
-                                                                       \
- void name##_global_lock(void) {                                       \
-       int i;                                                          \
-       preempt_disable();                                              \
-       rwlock_acquire(&name##_lock_dep_map, 0, 0, _RET_IP_);           \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_lock(lock);                                   \
-       }                                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_lock);                                    \
-                                                                       \
- void name##_global_unlock(void) {                                     \
-       int i;                                                          \
-       rwlock_release(&name##_lock_dep_map, 1, _RET_IP_);              \
-       for_each_possible_cpu(i) {                                      \
-               arch_spinlock_t *lock;                                  \
-               lock = &per_cpu(name##_lock, i);                        \
-               arch_spin_unlock(lock);                                 \
-       }                                                               \
-       preempt_enable();                                               \
- }                                                                     \
- EXPORT_SYMBOL(name##_global_unlock);
+       DEFINE_LGLOCK_LOCKDEP(name);                                    \
+       DEFINE_PER_CPU(arch_spinlock_t, name ## _lock)                  \
+       = __ARCH_SPIN_LOCK_UNLOCKED;                                    \
+       struct lglock name = { .lock = &name ## _lock }
+
+void lg_lock_init(struct lglock *lg, char *name);
+void lg_local_lock(struct lglock *lg);
+void lg_local_unlock(struct lglock *lg);
+void lg_local_lock_cpu(struct lglock *lg, int cpu);
+void lg_local_unlock_cpu(struct lglock *lg, int cpu);
+void lg_global_lock(struct lglock *lg);
+void lg_global_unlock(struct lglock *lg);
+
  #endif
diff --git a/include/linux/mm.h b/include/linux/mm.h

index ce26716238c3632ba95d909bf69e1e4d4bc91da2..b36d08ce5c578dcd18e224828217ded481de54ee 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1392,7 +1392,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo
  extern unsigned long mmap_region(struct file *file, unsigned long addr,
         unsigned long len, unsigned long flags,
         vm_flags_t vm_flags, unsigned long pgoff);
-extern unsigned long do_mmap(struct file *, unsigned long,
+extern unsigned long do_mmap_pgoff(struct file *, unsigned long,
          unsigned long, unsigned long,
          unsigned long, unsigned long);
  extern int do_munmap(struct mm_struct *, unsigned long, size_t);
diff --git a/include/linux/security.h b/include/linux/security.h

index ab0e091ce5facf0047c57191f9e631fd5c4bb791..4e5a73cdbbef18463920022626931d02c0540eb9 100644 (file)
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -86,9 +86,9 @@ extern int cap_inode_setxattr(struct dentry *dentry, const char *name,
  extern int cap_inode_removexattr(struct dentry *dentry, const char *name);
  extern int cap_inode_need_killpriv(struct dentry *dentry);
  extern int cap_inode_killpriv(struct dentry *dentry);
-extern int cap_file_mmap(struct file *file, unsigned long reqprot,
-                        unsigned long prot, unsigned long flags,
-                        unsigned long addr, unsigned long addr_only);
+extern int cap_mmap_addr(unsigned long addr);
+extern int cap_mmap_file(struct file *file, unsigned long reqprot,
+                        unsigned long prot, unsigned long flags);
  extern int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags);
  extern int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3,
                           unsigned long arg4, unsigned long arg5);
@@ -586,15 +586,17 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
   *     simple integer value.  When @arg represents a user space pointer, it
   *     should never be used by the security module.
   *     Return 0 if permission is granted.
- * @file_mmap :
+ * @mmap_addr :
+ *     Check permissions for a mmap operation at @addr.
+ *     @addr contains virtual address that will be used for the operation.
+ *     Return 0 if permission is granted.
+ * @mmap_file :
   *     Check permissions for a mmap operation.  The @file may be NULL, e.g.
   *     if mapping anonymous memory.
   *     @file contains the file structure for file to map (may be NULL).
   *     @reqprot contains the protection requested by the application.
   *     @prot contains the protection that will be applied by the kernel.
   *     @flags contains the operational flags.
- *     @addr contains virtual address that will be used for the operation.
- *     @addr_only contains a boolean: 0 if file-backed VMA, otherwise 1.
   *     Return 0 if permission is granted.
   * @file_mprotect:
   *     Check permissions before changing memory access permissions.
@@ -1481,10 +1483,10 @@ struct security_operations {
         void (*file_free_security) (struct file *file);
         int (*file_ioctl) (struct file *file, unsigned int cmd,
                            unsigned long arg);
-       int (*file_mmap) (struct file *file,
+       int (*mmap_addr) (unsigned long addr);
+       int (*mmap_file) (struct file *file,
                           unsigned long reqprot, unsigned long prot,
-                         unsigned long flags, unsigned long addr,
-                         unsigned long addr_only);
+                         unsigned long flags);
         int (*file_mprotect) (struct vm_area_struct *vma,
                               unsigned long reqprot,
                               unsigned long prot);
@@ -1743,9 +1745,9 @@ int security_file_permission(struct file *file, int mask);
  int security_file_alloc(struct file *file);
  void security_file_free(struct file *file);
  int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-int security_file_mmap(struct file *file, unsigned long reqprot,
-                       unsigned long prot, unsigned long flags,
-                       unsigned long addr, unsigned long addr_only);
+int security_mmap_file(struct file *file, unsigned long prot,
+                       unsigned long flags);
+int security_mmap_addr(unsigned long addr);
  int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                            unsigned long prot);
  int security_file_lock(struct file *file, unsigned int cmd);
@@ -2181,13 +2183,15 @@ static inline int security_file_ioctl(struct file *file, unsigned int cmd,
         return 0;
  }
  
-static inline int security_file_mmap(struct file *file, unsigned long reqprot,
-                                    unsigned long prot,
-                                    unsigned long flags,
-                                    unsigned long addr,
-                                    unsigned long addr_only)
+static inline int security_mmap_file(struct file *file, unsigned long prot,
+                                    unsigned long flags)
+{
+       return 0;
+}
+
+static inline int security_mmap_addr(unsigned long addr)
  {
-       return cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
+       return cap_mmap_addr(addr);
  }
  
  static inline int security_file_mprotect(struct vm_area_struct *vma,
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h

index 51b29ac45a8e7b26583df0217ab37a0d939ad6da..40e0a273faea3c07470e19fd23673fda89543f9b 100644 (file)
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -232,7 +232,6 @@ struct svc_rqst {
         struct svc_pool *       rq_pool;        /* thread pool */
         struct svc_procedure *  rq_procinfo;    /* procedure info */
         struct auth_ops *       rq_authop;      /* authentication flavour */
-       u32                     rq_flavor;      /* pseudoflavor */
         struct svc_cred         rq_cred;        /* auth info */
         void *                  rq_xprt_ctxt;   /* transport specific context ptr */
         struct svc_deferred_req*rq_deferred;    /* deferred request we are replaying */
@@ -416,6 +415,7 @@ struct svc_procedure {
   */
  int svc_rpcb_setup(struct svc_serv *serv, struct net *net);
  void svc_rpcb_cleanup(struct svc_serv *serv, struct net *net);
+int svc_bind(struct svc_serv *serv, struct net *net);
  struct svc_serv *svc_create(struct svc_program *, unsigned int,
                             void (*shutdown)(struct svc_serv *, struct net *net));
  struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
diff --git a/include/linux/sunrpc/svcauth.h b/include/linux/sunrpc/svcauth.h

index 2c54683b91decae417967b5a0703c96d84de9714..dd74084a9799891309f54db25b8259ae3388f3c8 100644 (file)
--- a/include/linux/sunrpc/svcauth.h
+++ b/include/linux/sunrpc/svcauth.h
@@ -15,13 +15,23 @@
  #include <linux/sunrpc/msg_prot.h>
  #include <linux/sunrpc/cache.h>
  #include <linux/hash.h>
+#include <linux/cred.h>
  
  struct svc_cred {
         uid_t                   cr_uid;
         gid_t                   cr_gid;
         struct group_info       *cr_group_info;
+       u32                     cr_flavor; /* pseudoflavor */
+       char                    *cr_principal; /* for gss */
  };
  
+static inline void free_svc_cred(struct svc_cred *cred)
+{
+       if (cred->cr_group_info)
+               put_group_info(cred->cr_group_info);
+       kfree(cred->cr_principal);
+}
+
  struct svc_rqst;               /* forward decl */
  struct in6_addr;
  
diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h

index 7c32daa025eb07b644d8185a27c8ea10d8b7c55f..726aff1a52011fcdfd3ab1e11b8a82ff1dbea703 100644 (file)
--- a/include/linux/sunrpc/svcauth_gss.h
+++ b/include/linux/sunrpc/svcauth_gss.h
@@ -22,7 +22,6 @@ int gss_svc_init_net(struct net *net);
  void gss_svc_shutdown_net(struct net *net);
  int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name);
  u32 svcauth_gss_flavor(struct auth_domain *dom);
-char *svc_gss_principal(struct svc_rqst *);
  
  #endif /* __KERNEL__ */
  #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */
diff --git a/include/linux/types.h b/include/linux/types.h

index 7f480db60231a714b9e520f3a16856c5d4e4a5e1..9c1bd539ea70e780e0e926b54bfc9320d3ec34a4 100644 (file)
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -25,7 +25,7 @@ typedef __kernel_dev_t                dev_t;
  typedef __kernel_ino_t         ino_t;
  typedef __kernel_mode_t                mode_t;
  typedef unsigned short         umode_t;
-typedef __kernel_nlink_t       nlink_t;
+typedef __u32                  nlink_t;
  typedef __kernel_off_t         off_t;
  typedef __kernel_pid_t         pid_t;
  typedef __kernel_daddr_t       daddr_t;
diff --git a/ipc/shm.c b/ipc/shm.c

index 406c5b208193373b979ce82bffe6617250ea64ed..5e2cbfdab6fc0d6b96a19c321a9208dda8cd130d 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1036,6 +1036,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
         sfd->file = shp->shm_file;
         sfd->vm_ops = NULL;
  
+       err = security_mmap_file(file, prot, flags);
+       if (err)
+               goto out_fput;
+
         down_write(&current->mm->mmap_sem);
         if (addr && !(shmflg & SHM_REMAP)) {
                 err = -EINVAL;
@@ -1050,7 +1054,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
                         goto invalid;
         }
                 
-       user_addr = do_mmap (file, addr, size, prot, flags, 0);
+       user_addr = do_mmap_pgoff(file, addr, size, prot, flags, 0);
         *raddr = user_addr;
         err = 0;
         if (IS_ERR_VALUE(user_addr))
@@ -1058,6 +1062,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
  invalid:
         up_write(&current->mm->mmap_sem);
  
+out_fput:
         fput(file);
  
  out_nattch:
diff --git a/kernel/Makefile b/kernel/Makefile

index 6f3d0ae044b24769c1caa61346174bc44a3684bc..c0cc67ad764ceddbe9f226ee1bfb90c4055f19ff 100644 (file)
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,7 +10,7 @@ obj-y     = fork.o exec_domain.o panic.o printk.o \
             kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
             hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
             notifier.o ksysfs.o cred.o \
-           async.o range.o groups.o
+           async.o range.o groups.o lglock.o
  
  ifdef CONFIG_FUNCTION_TRACER
  # Do not trace debug files and internal ftrace files
diff --git a/kernel/lglock.c b/kernel/lglock.c

new file mode 100644 (file)

index 0000000..6535a66
--- /dev/null
+++ b/kernel/lglock.c
@@ -0,0 +1,89 @@
+/* See include/linux/lglock.h for description */
+#include <linux/module.h>
+#include <linux/lglock.h>
+#include <linux/cpu.h>
+#include <linux/string.h>
+
+/*
+ * Note there is no uninit, so lglocks cannot be defined in
+ * modules (but it's fine to use them from there).
+ * An uninit could be added, though: it would just undo lg_lock_init.
+ */
+
+void lg_lock_init(struct lglock *lg, char *name)
+{
+       LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
+}
+EXPORT_SYMBOL(lg_lock_init);
+
+void lg_local_lock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock);
+
+void lg_local_unlock(struct lglock *lg)
+{
+       arch_spinlock_t *lock;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = this_cpu_ptr(lg->lock);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock);
+
+void lg_local_lock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+
+       preempt_disable();
+       rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_lock(lock);
+}
+EXPORT_SYMBOL(lg_local_lock_cpu);
+
+void lg_local_unlock_cpu(struct lglock *lg, int cpu)
+{
+       arch_spinlock_t *lock;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       lock = per_cpu_ptr(lg->lock, cpu);
+       arch_spin_unlock(lock);
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_local_unlock_cpu);
+
+void lg_global_lock(struct lglock *lg)
+{
+       int i;
+
+       preempt_disable();
+       rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_lock(lock);
+       }
+}
+EXPORT_SYMBOL(lg_global_lock);
+
+void lg_global_unlock(struct lglock *lg)
+{
+       int i;
+
+       rwlock_release(&lg->lock_dep_map, 1, _RET_IP_);
+       for_each_possible_cpu(i) {
+               arch_spinlock_t *lock;
+               lock = per_cpu_ptr(lg->lock, i);
+               arch_spin_unlock(lock);
+       }
+       preempt_enable();
+}
+EXPORT_SYMBOL(lg_global_unlock);
diff --git a/mm/cleancache.c b/mm/cleancache.c

index 5646c740f613ed1ec8b34a094f76d7934eed1aac..32e6f4136fa2297e13a6ac51444d50c18b78e9a3 100644 (file)
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -80,7 +80,7 @@ EXPORT_SYMBOL(__cleancache_init_shared_fs);
  static int cleancache_get_key(struct inode *inode,
                               struct cleancache_filekey *key)
  {
-       int (*fhfn)(struct dentry *, __u32 *fh, int *, int);
+       int (*fhfn)(struct inode *, __u32 *fh, int *, struct inode *);
         int len = 0, maxlen = CLEANCACHE_KEY_MAX;
         struct super_block *sb = inode->i_sb;
  
@@ -88,9 +88,7 @@ static int cleancache_get_key(struct inode *inode,
         if (sb->s_export_op != NULL) {
                 fhfn = sb->s_export_op->encode_fh;
                 if  (fhfn) {
-                       struct dentry d;
-                       d.d_inode = inode;
-                       len = (*fhfn)(&d, &key->u.fh[0], &maxlen, 0);
+                       len = (*fhfn)(inode, &key->u.fh[0], &maxlen, NULL);
                         if (len <= 0 || len == 255)
                                 return -1;
                         if (maxlen > CLEANCACHE_KEY_MAX)
diff --git a/mm/filemap.c b/mm/filemap.c

index 64b48f934b897451154e5517b38704fb19e4f86a..a4a5260b0279b77b37738540b1e8c24fb446a3e5 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1899,71 +1899,6 @@ struct page *read_cache_page(struct address_space *mapping,
  }
  EXPORT_SYMBOL(read_cache_page);
  
-/*
- * The logic we want is
- *
- *     if suid or (sgid and xgrp)
- *             remove privs
- */
-int should_remove_suid(struct dentry *dentry)
-{
-       umode_t mode = dentry->d_inode->i_mode;
-       int kill = 0;
-
-       /* suid always must be killed */
-       if (unlikely(mode & S_ISUID))
-               kill = ATTR_KILL_SUID;
-
-       /*
-        * sgid without any exec bits is just a mandatory locking mark; leave
-        * it alone.  If some exec bits are set, it's a real sgid; kill it.
-        */
-       if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
-               kill |= ATTR_KILL_SGID;
-
-       if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
-               return kill;
-
-       return 0;
-}
-EXPORT_SYMBOL(should_remove_suid);
-
-static int __remove_suid(struct dentry *dentry, int kill)
-{
-       struct iattr newattrs;
-
-       newattrs.ia_valid = ATTR_FORCE | kill;
-       return notify_change(dentry, &newattrs);
-}
-
-int file_remove_suid(struct file *file)
-{
-       struct dentry *dentry = file->f_path.dentry;
-       struct inode *inode = dentry->d_inode;
-       int killsuid;
-       int killpriv;
-       int error = 0;
-
-       /* Fast path for nothing security related */
-       if (IS_NOSEC(inode))
-               return 0;
-
-       killsuid = should_remove_suid(dentry);
-       killpriv = security_inode_need_killpriv(dentry);
-
-       if (killpriv < 0)
-               return killpriv;
-       if (killpriv)
-               error = security_inode_killpriv(dentry);
-       if (!error && killsuid)
-               error = __remove_suid(dentry, killsuid);
-       if (!error && (inode->i_sb->s_flags & MS_NOSEC))
-               inode->i_flags |= S_NOSEC;
-
-       return error;
-}
-EXPORT_SYMBOL(file_remove_suid);
-
  static size_t __iovec_copy_from_user_inatomic(char *vaddr,
                         const struct iovec *iov, size_t base, size_t bytes)
  {
@@ -2489,7 +2424,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
         if (err)
                 goto out;
  
-       file_update_time(file);
+       err = file_update_time(file);
+       if (err)
+               goto out;
  
         /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
         if (unlikely(file->f_flags & O_DIRECT)) {
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c

index a4eb3113222912c9aada14bd92c6b68d01577b73..213ca1f5340980e1ce6fad8d4f12e50858d61397 100644 (file)
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -426,7 +426,9 @@ xip_file_write(struct file *filp, const char __user *buf, size_t len,
         if (ret)
                 goto out_backing;
  
-       file_update_time(filp);
+       ret = file_update_time(filp);
+       if (ret)
+               goto out_backing;
  
         ret = __xip_file_write (filp, buf, count, pos, ppos);
  
diff --git a/mm/internal.h b/mm/internal.h

index 4194ab9dc19b412aa8e15f1b89612aa2595f0c9f..5cbb78190041573ee4e92e77a12a8f129514862f 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -350,3 +350,7 @@ extern u64 hwpoison_filter_flags_mask;
  extern u64 hwpoison_filter_flags_value;
  extern u64 hwpoison_filter_memcg;
  extern u32 hwpoison_filter_enable;
+
+extern unsigned long vm_mmap_pgoff(struct file *, unsigned long,
+        unsigned long, unsigned long,
+        unsigned long, unsigned long);
diff --git a/mm/mmap.c b/mm/mmap.c

index 4a9c2a391e28efe523e1a3aaaf9ab06d24627e86..3edfcdfa42d9f27a5238780065220ec3b4fc702a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -971,15 +971,13 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
   * The caller must hold down_write(&current->mm->mmap_sem).
   */
  
-static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                         unsigned long len, unsigned long prot,
                         unsigned long flags, unsigned long pgoff)
  {
         struct mm_struct * mm = current->mm;
         struct inode *inode;
         vm_flags_t vm_flags;
-       int error;
-       unsigned long reqprot = prot;
  
         /*
          * Does the application expect PROT_READ to imply PROT_EXEC?
@@ -1101,39 +1099,9 @@ static unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
                 }
         }
  
-       error = security_file_mmap(file, reqprot, prot, flags, addr, 0);
-       if (error)
-               return error;
-
         return mmap_region(file, addr, len, flags, vm_flags, pgoff);
  }
  
-unsigned long do_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       if (unlikely(offset + PAGE_ALIGN(len) < offset))
-               return -EINVAL;
-       if (unlikely(offset & ~PAGE_MASK))
-               return -EINVAL;
-       return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       unsigned long ret;
-       struct mm_struct *mm = current->mm;
-
-       down_write(&mm->mmap_sem);
-       ret = do_mmap(file, addr, len, prot, flag, offset);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
  SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                 unsigned long, prot, unsigned long, flags,
                 unsigned long, fd, unsigned long, pgoff)
@@ -1165,10 +1133,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
  
         flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
  
-       down_write(&current->mm->mmap_sem);
-       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&current->mm->mmap_sem);
-
+       retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
         if (file)
                 fput(file);
  out:
@@ -1629,7 +1594,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
         if (addr & ~PAGE_MASK)
                 return -EINVAL;
  
-       return arch_rebalance_pgtables(addr, len);
+       addr = arch_rebalance_pgtables(addr, len);
+       error = security_mmap_addr(addr);
+       return error ? error : addr;
  }
  
  EXPORT_SYMBOL(get_unmapped_area);
@@ -1819,7 +1786,7 @@ int expand_downwards(struct vm_area_struct *vma,
                 return -ENOMEM;
  
         address &= PAGE_MASK;
-       error = security_file_mmap(NULL, 0, 0, 0, address, 1);
+       error = security_mmap_addr(address);
         if (error)
                 return error;
  
@@ -2159,7 +2126,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
  
         return 0;
  }
-EXPORT_SYMBOL(do_munmap);
  
  int vm_munmap(unsigned long start, size_t len)
  {
@@ -2207,10 +2173,6 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
         if (!len)
                 return addr;
  
-       error = security_file_mmap(NULL, 0, 0, 0, addr, 1);
-       if (error)
-               return error;
-
         flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
  
         error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
@@ -2563,10 +2525,6 @@ int install_special_mapping(struct mm_struct *mm,
         vma->vm_ops = &special_mapping_vmops;
         vma->vm_private_data = pages;
  
-       ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
-       if (ret)
-               goto out;
-
         ret = insert_vm_struct(mm, vma);
         if (ret)
                 goto out;
diff --git a/mm/mremap.c b/mm/mremap.c

index db8d983b5a7d7a2d6746ccbf74471d2ced3cdd96..21fed202ddad865bb3ee70d07d8ebce17fd37493 100644 (file)
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -371,10 +371,6 @@ static unsigned long mremap_to(unsigned long addr,
         if ((addr <= new_addr) && (addr+old_len) > new_addr)
                 goto out;
  
-       ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-       if (ret)
-               goto out;
-
         ret = do_munmap(mm, new_addr, new_len);
         if (ret)
                 goto out;
@@ -432,15 +428,17 @@ static int vma_expandable(struct vm_area_struct *vma, unsigned long delta)
   * MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
   * This option implies MREMAP_MAYMOVE.
   */
-unsigned long do_mremap(unsigned long addr,
-       unsigned long old_len, unsigned long new_len,
-       unsigned long flags, unsigned long new_addr)
+SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
+               unsigned long, new_len, unsigned long, flags,
+               unsigned long, new_addr)
  {
         struct mm_struct *mm = current->mm;
         struct vm_area_struct *vma;
         unsigned long ret = -EINVAL;
         unsigned long charged = 0;
  
+       down_write(&current->mm->mmap_sem);
+
         if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
                 goto out;
  
@@ -530,25 +528,11 @@ unsigned long do_mremap(unsigned long addr,
                         goto out;
                 }
  
-               ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1);
-               if (ret)
-                       goto out;
                 ret = move_vma(vma, addr, old_len, new_len, new_addr);
         }
  out:
         if (ret & ~PAGE_MASK)
                 vm_unacct_memory(charged);
-       return ret;
-}
-
-SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
-               unsigned long, new_len, unsigned long, flags,
-               unsigned long, new_addr)
-{
-       unsigned long ret;
-
-       down_write(&current->mm->mmap_sem);
-       ret = do_mremap(addr, old_len, new_len, flags, new_addr);
         up_write(&current->mm->mmap_sem);
         return ret;
  }
diff --git a/mm/nommu.c b/mm/nommu.c

index bb8f4f004a82ce57abb0653a9a8ed72d533f5c45..c4acfbc099727b3f5151ed5917961ff59efb7481 100644 (file)
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -889,7 +889,6 @@ static int validate_mmap_request(struct file *file,
                                  unsigned long *_capabilities)
  {
         unsigned long capabilities, rlen;
-       unsigned long reqprot = prot;
         int ret;
  
         /* do the simple checks first */
@@ -1047,7 +1046,7 @@ static int validate_mmap_request(struct file *file,
         }
  
         /* allow the security API to have its say */
-       ret = security_file_mmap(file, reqprot, prot, flags, addr, 0);
+       ret = security_mmap_addr(addr);
         if (ret < 0)
                 return ret;
  
@@ -1233,7 +1232,7 @@ enomem:
  /*
   * handle mapping creation for uClinux
   */
-static unsigned long do_mmap_pgoff(struct file *file,
+unsigned long do_mmap_pgoff(struct file *file,
                             unsigned long addr,
                             unsigned long len,
                             unsigned long prot,
@@ -1471,32 +1470,6 @@ error_getting_region:
         return -ENOMEM;
  }
  
-unsigned long do_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       if (unlikely(offset + PAGE_ALIGN(len) < offset))
-               return -EINVAL;
-       if (unlikely(offset & ~PAGE_MASK))
-               return -EINVAL;
-       return do_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
-}
-EXPORT_SYMBOL(do_mmap);
-
-unsigned long vm_mmap(struct file *file, unsigned long addr,
-       unsigned long len, unsigned long prot,
-       unsigned long flag, unsigned long offset)
-{
-       unsigned long ret;
-       struct mm_struct *mm = current->mm;
-
-       down_write(&mm->mmap_sem);
-       ret = do_mmap(file, addr, len, prot, flag, offset);
-       up_write(&mm->mmap_sem);
-       return ret;
-}
-EXPORT_SYMBOL(vm_mmap);
-
  SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
                 unsigned long, prot, unsigned long, flags,
                 unsigned long, fd, unsigned long, pgoff)
@@ -1513,9 +1486,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
  
         flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
  
-       down_write(&current->mm->mmap_sem);
-       retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
-       up_write(&current->mm->mmap_sem);
+       retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
  
         if (file)
                 fput(file);
diff --git a/mm/shmem.c b/mm/shmem.c

index d576b84d913c40c89232b5709360b6e4abce4dad..585bd220a21ee4e5eefaec2b9c46dc9ae68fde98 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2439,11 +2439,9 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb,
         return dentry;
  }
  
-static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len,
-                               int connectable)
+static int shmem_encode_fh(struct inode *inode, __u32 *fh, int *len,
+                               struct inode *parent)
  {
-       struct inode *inode = dentry->d_inode;
-
         if (*len < 3) {
                 *len = 3;
                 return 255;
diff --git a/mm/util.c b/mm/util.c

index ae962b31de888a55990769aae948bac3ef0db338..8c7265afa29f2109b884907daa050b79f0b25f8b 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
  #include <linux/export.h>
  #include <linux/err.h>
  #include <linux/sched.h>
+#include <linux/security.h>
  #include <asm/uaccess.h>
  
  #include "internal.h"
@@ -341,6 +342,35 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
  }
  EXPORT_SYMBOL_GPL(get_user_pages_fast);
  
+unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flag, unsigned long pgoff)
+{
+       unsigned long ret;
+       struct mm_struct *mm = current->mm;
+
+       ret = security_mmap_file(file, prot, flag);
+       if (!ret) {
+               down_write(&mm->mmap_sem);
+               ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff);
+               up_write(&mm->mmap_sem);
+       }
+       return ret;
+}
+
+unsigned long vm_mmap(struct file *file, unsigned long addr,
+       unsigned long len, unsigned long prot,
+       unsigned long flag, unsigned long offset)
+{
+       if (unlikely(offset + PAGE_ALIGN(len) < offset))
+               return -EINVAL;
+       if (unlikely(offset & ~PAGE_MASK))
+               return -EINVAL;
+
+       return vm_mmap_pgoff(file, addr, len, prot, flag, offset >> PAGE_SHIFT);
+}
+EXPORT_SYMBOL(vm_mmap);
+
  /* Tracepoints definitions. */
  EXPORT_TRACEPOINT_SYMBOL(kmalloc);
  EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c

index 8522a4793374136fa4ab66aa9b325b48019801ac..ca8e0a57d945dabeb51147f338cef363672040d5 100644 (file)
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -16,8 +16,6 @@
  #include <net/netlink.h>
  #include <net/pkt_sched.h>
  
-extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */
-
  /*
   * The ATM queuing discipline provides a framework for invoking classifiers
   * (aka "filters"), which in turn select classes of this queuing discipline.
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c

index 38f388c39dce89a5e6456514771f70ef975af1c0..107c4528654fd5867b8363ccdf66c648e9202a34 100644 (file)
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -381,21 +381,53 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
  }
  
  /*
- * We cannot currently handle tokens with rotated data.  We need a
- * generalized routine to rotate the data in place.  It is anticipated
- * that we won't encounter rotated data in the general case.
+ * We can shift data by up to LOCAL_BUF_LEN bytes in a pass.  If we need
+ * to do more than that, we shift repeatedly.  Kevin Coffman reports
+ * seeing 28 bytes as the value used by Microsoft clients and servers
+ * with AES, so this constant is chosen to allow handling 28 in one pass
+ * without using too much stack space.
+ *
+ * If that proves to be a problem perhaps we could use a more clever
+ * algorithm.
   */
-static u32
-rotate_left(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf, u16 rrc)
+#define LOCAL_BUF_LEN 32u
+
+static void rotate_buf_a_little(struct xdr_buf *buf, unsigned int shift)
  {
-       unsigned int realrrc = rrc % (buf->len - offset - GSS_KRB5_TOK_HDR_LEN);
+       char head[LOCAL_BUF_LEN];
+       char tmp[LOCAL_BUF_LEN];
+       unsigned int this_len, i;
+
+       BUG_ON(shift > LOCAL_BUF_LEN);
  
-       if (realrrc == 0)
-               return 0;
+       read_bytes_from_xdr_buf(buf, 0, head, shift);
+       for (i = 0; i + shift < buf->len; i += LOCAL_BUF_LEN) {
+               this_len = min(LOCAL_BUF_LEN, buf->len - (i + shift));
+               read_bytes_from_xdr_buf(buf, i+shift, tmp, this_len);
+               write_bytes_to_xdr_buf(buf, i, tmp, this_len);
+       }
+       write_bytes_to_xdr_buf(buf, buf->len - shift, head, shift);
+}
  
-       dprintk("%s: cannot process token with rotated data: "
-               "rrc %u, realrrc %u\n", __func__, rrc, realrrc);
-       return 1;
+static void _rotate_left(struct xdr_buf *buf, unsigned int shift)
+{
+       int shifted = 0;
+       int this_shift;
+
+       shift %= buf->len;
+       while (shifted < shift) {
+               this_shift = min(shift - shifted, LOCAL_BUF_LEN);
+               rotate_buf_a_little(buf, this_shift);
+               shifted += this_shift;
+       }
+}
+
+static void rotate_left(u32 base, struct xdr_buf *buf, unsigned int shift)
+{
+       struct xdr_buf subbuf;
+
+       xdr_buf_subsegment(buf, &subbuf, base, buf->len - base);
+       _rotate_left(&subbuf, shift);
  }
  
  static u32
@@ -495,11 +527,8 @@ gss_unwrap_kerberos_v2(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
  
         seqnum = be64_to_cpup((__be64 *)(ptr + 8));
  
-       if (rrc != 0) {
-               err = rotate_left(kctx, offset, buf, rrc);
-               if (err)
-                       return GSS_S_FAILURE;
-       }
+       if (rrc != 0)
+               rotate_left(offset + 16, buf, rrc);
  
         err = (*kctx->gk5e->decrypt_v2)(kctx, offset, buf,
                                         &headskip, &tailskip);
diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c

index 3089de37c433157cd45cff21974e0a242743c715..73e95738660042e7a9d4e7cb252143ec74078265 100644 (file)
--- a/net/sunrpc/auth_gss/svcauth_gss.c
+++ b/net/sunrpc/auth_gss/svcauth_gss.c
@@ -336,7 +336,6 @@ struct rsc {
         struct svc_cred         cred;
         struct gss_svc_seq_data seqdata;
         struct gss_ctx          *mechctx;
-       char                    *client_name;
  };
  
  static struct rsc *rsc_update(struct cache_detail *cd, struct rsc *new, struct rsc *old);
@@ -347,9 +346,7 @@ static void rsc_free(struct rsc *rsci)
         kfree(rsci->handle.data);
         if (rsci->mechctx)
                 gss_delete_sec_context(&rsci->mechctx);
-       if (rsci->cred.cr_group_info)
-               put_group_info(rsci->cred.cr_group_info);
-       kfree(rsci->client_name);
+       free_svc_cred(&rsci->cred);
  }
  
  static void rsc_put(struct kref *ref)
@@ -387,7 +384,7 @@ rsc_init(struct cache_head *cnew, struct cache_head *ctmp)
         tmp->handle.data = NULL;
         new->mechctx = NULL;
         new->cred.cr_group_info = NULL;
-       new->client_name = NULL;
+       new->cred.cr_principal = NULL;
  }
  
  static void
@@ -402,8 +399,8 @@ update_rsc(struct cache_head *cnew, struct cache_head *ctmp)
         spin_lock_init(&new->seqdata.sd_lock);
         new->cred = tmp->cred;
         tmp->cred.cr_group_info = NULL;
-       new->client_name = tmp->client_name;
-       tmp->client_name = NULL;
+       new->cred.cr_principal = tmp->cred.cr_principal;
+       tmp->cred.cr_principal = NULL;
  }
  
  static struct cache_head *
@@ -501,8 +498,8 @@ static int rsc_parse(struct cache_detail *cd,
                 /* get client name */
                 len = qword_get(&mesg, buf, mlen);
                 if (len > 0) {
-                       rsci.client_name = kstrdup(buf, GFP_KERNEL);
-                       if (!rsci.client_name)
+                       rsci.cred.cr_principal = kstrdup(buf, GFP_KERNEL);
+                       if (!rsci.cred.cr_principal)
                                 goto out;
                 }
  
@@ -932,16 +929,6 @@ struct gss_svc_data {
         struct rsc                      *rsci;
  };
  
-char *svc_gss_principal(struct svc_rqst *rqstp)
-{
-       struct gss_svc_data *gd = (struct gss_svc_data *)rqstp->rq_auth_data;
-
-       if (gd && gd->rsci)
-               return gd->rsci->client_name;
-       return NULL;
-}
-EXPORT_SYMBOL_GPL(svc_gss_principal);
-
  static int
  svcauth_gss_set_client(struct svc_rqst *rqstp)
  {
@@ -1220,7 +1207,7 @@ svcauth_gss_accept(struct svc_rqst *rqstp, __be32 *authp)
                 }
                 svcdata->rsci = rsci;
                 cache_get(&rsci->h);
-               rqstp->rq_flavor = gss_svc_to_pseudoflavor(
+               rqstp->rq_cred.cr_flavor = gss_svc_to_pseudoflavor(
                                         rsci->mechctx->mech_type, gc->gc_svc);
                 ret = SVC_OK;
                 goto out;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c

index 3c0653439f3dc398031301d53819c3b6e78bc4ef..92509ffe15fcacce5de331cbb205a84c4f718a86 100644 (file)
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -180,14 +180,16 @@ void rpcb_put_local(struct net *net)
         struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
         struct rpc_clnt *clnt = sn->rpcb_local_clnt;
         struct rpc_clnt *clnt4 = sn->rpcb_local_clnt4;
-       int shutdown;
+       int shutdown = 0;
  
         spin_lock(&sn->rpcb_clnt_lock);
-       if (--sn->rpcb_users == 0) {
-               sn->rpcb_local_clnt = NULL;
-               sn->rpcb_local_clnt4 = NULL;
+       if (sn->rpcb_users) {
+               if (--sn->rpcb_users == 0) {
+                       sn->rpcb_local_clnt = NULL;
+                       sn->rpcb_local_clnt4 = NULL;
+               }
+               shutdown = !sn->rpcb_users;
         }
-       shutdown = !sn->rpcb_users;
         spin_unlock(&sn->rpcb_clnt_lock);
  
         if (shutdown) {
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c

index 017c0117d1543a784dfe5130396c74f80879131a..7e9baaa1e543e55878dcb0d9bd0378a0e51754e0 100644 (file)
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -407,6 +407,14 @@ static int svc_uses_rpcbind(struct svc_serv *serv)
         return 0;
  }
  
+int svc_bind(struct svc_serv *serv, struct net *net)
+{
+       if (!svc_uses_rpcbind(serv))
+               return 0;
+       return svc_rpcb_setup(serv, net);
+}
+EXPORT_SYMBOL_GPL(svc_bind);
+
  /*
   * Create an RPC service
   */
@@ -471,15 +479,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
                 spin_lock_init(&pool->sp_lock);
         }
  
-       if (svc_uses_rpcbind(serv)) {
-               if (svc_rpcb_setup(serv, current->nsproxy->net_ns) < 0) {
-                       kfree(serv->sv_pools);
-                       kfree(serv);
-                       return NULL;
-               }
-               if (!serv->sv_shutdown)
-                       serv->sv_shutdown = svc_rpcb_cleanup;
-       }
+       if (svc_uses_rpcbind(serv) && (!serv->sv_shutdown))
+               serv->sv_shutdown = svc_rpcb_cleanup;
  
         return serv;
  }
@@ -536,8 +537,6 @@ EXPORT_SYMBOL_GPL(svc_shutdown_net);
  void
  svc_destroy(struct svc_serv *serv)
  {
-       struct net *net = current->nsproxy->net_ns;
-
         dprintk("svc: svc_destroy(%s, %d)\n",
                                 serv->sv_program->pg_name,
                                 serv->sv_nrthreads);
@@ -552,8 +551,6 @@ svc_destroy(struct svc_serv *serv)
  
         del_timer_sync(&serv->sv_temptimer);
  
-       svc_shutdown_net(serv, net);
-
         /*
          * The last user is gone and thus all sockets have to be destroyed to
          * the point. Check this.
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c

index b98ee35149121602b42ace9365bfd5f21e84767a..88f2bf671960d444e73d3d9eba2998f75ac2885b 100644 (file)
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -598,6 +598,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
  
         /* now allocate needed pages.  If we get a failure, sleep briefly */
         pages = (serv->sv_max_mesg + PAGE_SIZE) / PAGE_SIZE;
+       BUG_ON(pages >= RPCSVC_MAXPAGES);
         for (i = 0; i < pages ; i++)
                 while (rqstp->rq_pages[i] == NULL) {
                         struct page *p = alloc_page(GFP_KERNEL);
@@ -612,7 +613,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout)
                         rqstp->rq_pages[i] = p;
                 }
         rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
-       BUG_ON(pages >= RPCSVC_MAXPAGES);
  
         /* Make arg->head point to first page and arg->pages point to rest */
         arg = &rqstp->rq_arg;
@@ -973,7 +973,7 @@ void svc_close_net(struct svc_serv *serv, struct net *net)
         svc_clear_pools(serv, net);
         /*
          * At this point the sp_sockets lists will stay empty, since
-        * svc_enqueue will not add new entries without taking the
+        * svc_xprt_enqueue will not add new entries without taking the
          * sp_lock and checking XPT_BUSY.
          */
         svc_clear_list(&serv->sv_tempsocks, net);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c

index 6138c925923d00715cb8695ebd2cfbb719287699..2777fa896645de3f063aa5ad67cb054bbb75a894 100644 (file)
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -746,6 +746,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
         struct svc_cred *cred = &rqstp->rq_cred;
  
         cred->cr_group_info = NULL;
+       cred->cr_principal = NULL;
         rqstp->rq_client = NULL;
  
         if (argv->iov_len < 3*4)
@@ -773,7 +774,7 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
         svc_putnl(resv, RPC_AUTH_NULL);
         svc_putnl(resv, 0);
  
-       rqstp->rq_flavor = RPC_AUTH_NULL;
+       rqstp->rq_cred.cr_flavor = RPC_AUTH_NULL;
         return SVC_OK;
  }
  
@@ -811,6 +812,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
         int             len   = argv->iov_len;
  
         cred->cr_group_info = NULL;
+       cred->cr_principal = NULL;
         rqstp->rq_client = NULL;
  
         if ((len -= 3*4) < 0)
@@ -847,7 +849,7 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
         svc_putnl(resv, RPC_AUTH_NULL);
         svc_putnl(resv, 0);
  
-       rqstp->rq_flavor = RPC_AUTH_UNIX;
+       rqstp->rq_cred.cr_flavor = RPC_AUTH_UNIX;
         return SVC_OK;
  
  badcred:
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c

index 032daab449b0bb3007562e795593a15d247a2c58..8ea39aabe94889a224c868757196afc084c578b8 100644 (file)
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -490,17 +490,9 @@ static int common_mmap(int op, struct file *file, unsigned long prot,
         return common_file_perm(op, file, mask);
  }
  
-static int apparmor_file_mmap(struct file *file, unsigned long reqprot,
-                             unsigned long prot, unsigned long flags,
-                             unsigned long addr, unsigned long addr_only)
+static int apparmor_mmap_file(struct file *file, unsigned long reqprot,
+                             unsigned long prot, unsigned long flags)
  {
-       int rc = 0;
-
-       /* do DAC check */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
-
         return common_mmap(OP_FMMAP, file, prot, flags);
  }
  
@@ -646,7 +638,8 @@ static struct security_operations apparmor_ops = {
         .file_permission =              apparmor_file_permission,
         .file_alloc_security =          apparmor_file_alloc_security,
         .file_free_security =           apparmor_file_free_security,
-       .file_mmap =                    apparmor_file_mmap,
+       .mmap_file =                    apparmor_mmap_file,
+       .mmap_addr =                    cap_mmap_addr,
         .file_mprotect =                apparmor_file_mprotect,
         .file_lock =                    apparmor_file_lock,
  
diff --git a/security/capability.c b/security/capability.c

index fca889676c5e9e5726f6c3136fb8c26cfe85f63c..61095df8b89ac452d50528144a67dca751d4f992 100644 (file)
--- a/security/capability.c
+++ b/security/capability.c
@@ -949,7 +949,8 @@ void __init security_fixup_ops(struct security_operations *ops)
         set_to_cap_if_null(ops, file_alloc_security);
         set_to_cap_if_null(ops, file_free_security);
         set_to_cap_if_null(ops, file_ioctl);
-       set_to_cap_if_null(ops, file_mmap);
+       set_to_cap_if_null(ops, mmap_addr);
+       set_to_cap_if_null(ops, mmap_file);
         set_to_cap_if_null(ops, file_mprotect);
         set_to_cap_if_null(ops, file_lock);
         set_to_cap_if_null(ops, file_fcntl);
diff --git a/security/commoncap.c b/security/commoncap.c

index e771cb1b2d7947f0c85651b38cc7c9c1d3da11d7..6dbae4650abe20208ff66eb27015e21b964d0344 100644 (file)
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -958,22 +958,15 @@ int cap_vm_enough_memory(struct mm_struct *mm, long pages)
  }
  
  /*
- * cap_file_mmap - check if able to map given addr
- * @file: unused
- * @reqprot: unused
- * @prot: unused
- * @flags: unused
+ * cap_mmap_addr - check if able to map given addr
   * @addr: address attempting to be mapped
- * @addr_only: unused
   *
   * If the process is attempting to map memory below dac_mmap_min_addr they need
   * CAP_SYS_RAWIO.  The other parameters to this function are unused by the
   * capability security module.  Returns 0 if this mapping should be allowed
   * -EPERM if not.
   */
-int cap_file_mmap(struct file *file, unsigned long reqprot,
-                 unsigned long prot, unsigned long flags,
-                 unsigned long addr, unsigned long addr_only)
+int cap_mmap_addr(unsigned long addr)
  {
         int ret = 0;
  
@@ -986,3 +979,9 @@ int cap_file_mmap(struct file *file, unsigned long reqprot,
         }
         return ret;
  }
+
+int cap_mmap_file(struct file *file, unsigned long reqprot,
+                 unsigned long prot, unsigned long flags)
+{
+       return 0;
+}
diff --git a/security/security.c b/security/security.c

index 5497a57fba0154a24b1b87835930e6cc685f855b..3efc9b12aef44016201b02eeafcc10140f17a240 100644 (file)
--- a/security/security.c
+++ b/security/security.c
@@ -20,6 +20,9 @@
  #include <linux/ima.h>
  #include <linux/evm.h>
  #include <linux/fsnotify.h>
+#include <linux/mman.h>
+#include <linux/mount.h>
+#include <linux/personality.h>
  #include <net/flow.h>
  
  #define MAX_LSM_EVM_XATTR      2
@@ -657,18 +660,56 @@ int security_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
         return security_ops->file_ioctl(file, cmd, arg);
  }
  
-int security_file_mmap(struct file *file, unsigned long reqprot,
-                       unsigned long prot, unsigned long flags,
-                       unsigned long addr, unsigned long addr_only)
+static inline unsigned long mmap_prot(struct file *file, unsigned long prot)
  {
-       int ret;
+       /*
+        * Do we have PROT_READ and does the application expect
+        * it to imply PROT_EXEC?  If not, nothing to talk about...
+        */
+       if ((prot & (PROT_READ | PROT_EXEC)) != PROT_READ)
+               return prot;
+       if (!(current->personality & READ_IMPLIES_EXEC))
+               return prot;
+       /*
+        * if that's an anonymous mapping, let it.
+        */
+       if (!file)
+               return prot | PROT_EXEC;
+       /*
+        * ditto if it's not on noexec mount, except that on !MMU we need
+        * BDI_CAP_EXEC_MAP (== VM_MAYEXEC) in this case
+        */
+       if (!(file->f_path.mnt->mnt_flags & MNT_NOEXEC)) {
+#ifndef CONFIG_MMU
+               unsigned long caps = 0;
+               struct address_space *mapping = file->f_mapping;
+               if (mapping && mapping->backing_dev_info)
+                       caps = mapping->backing_dev_info->capabilities;
+               if (!(caps & BDI_CAP_EXEC_MAP))
+                       return prot;
+#endif
+               return prot | PROT_EXEC;
+       }
+       /* anything on noexec mount won't get PROT_EXEC */
+       return prot;
+}
  
-       ret = security_ops->file_mmap(file, reqprot, prot, flags, addr, addr_only);
+int security_mmap_file(struct file *file, unsigned long prot,
+                       unsigned long flags)
+{
+       int ret;
+       ret = security_ops->mmap_file(file, prot,
+                                       mmap_prot(file, prot), flags);
         if (ret)
                 return ret;
         return ima_file_mmap(file, prot);
  }
  
+int security_mmap_addr(unsigned long addr)
+{
+       return security_ops->mmap_addr(addr);
+}
+
  int security_file_mprotect(struct vm_area_struct *vma, unsigned long reqprot,
                             unsigned long prot)
  {
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c

index fa2341b683314b0c5505f905e6712538555300ad..372ec6502aa8752dca83c3c507e2d0ce9cac84d1 100644 (file)
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -3083,9 +3083,7 @@ error:
         return rc;
  }
  
-static int selinux_file_mmap(struct file *file, unsigned long reqprot,
-                            unsigned long prot, unsigned long flags,
-                            unsigned long addr, unsigned long addr_only)
+static int selinux_mmap_addr(unsigned long addr)
  {
         int rc = 0;
         u32 sid = current_sid();
@@ -3104,10 +3102,12 @@ static int selinux_file_mmap(struct file *file, unsigned long reqprot,
         }
  
         /* do DAC check on address space usage */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
+       return cap_mmap_addr(addr);
+}
  
+static int selinux_mmap_file(struct file *file, unsigned long reqprot,
+                            unsigned long prot, unsigned long flags)
+{
         if (selinux_checkreqprot)
                 prot = reqprot;
  
@@ -5570,7 +5570,8 @@ static struct security_operations selinux_ops = {
         .file_alloc_security =          selinux_file_alloc_security,
         .file_free_security =           selinux_file_free_security,
         .file_ioctl =                   selinux_file_ioctl,
-       .file_mmap =                    selinux_file_mmap,
+       .mmap_file =                    selinux_mmap_file,
+       .mmap_addr =                    selinux_mmap_addr,
         .file_mprotect =                selinux_file_mprotect,
         .file_lock =                    selinux_file_lock,
         .file_fcntl =                   selinux_file_fcntl,
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c

index 4e93f9ef970b25a78bca26ab2a49962024b3cb50..3ad2902512888282e299b64434a7790d9788e060 100644 (file)
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -1259,12 +1259,8 @@ static int sel_make_bools(void)
                 if (!inode)
                         goto out;
  
-               ret = -EINVAL;
-               len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
-               if (len < 0)
-                       goto out;
-
                 ret = -ENAMETOOLONG;
+               len = snprintf(page, PAGE_SIZE, "/%s/%s", BOOL_DIR_NAME, names[i]);
                 if (len >= PAGE_SIZE)
                         goto out;
  
@@ -1557,19 +1553,10 @@ static inline u32 sel_ino_to_perm(unsigned long ino)
  static ssize_t sel_read_class(struct file *file, char __user *buf,
                                 size_t count, loff_t *ppos)
  {
-       ssize_t rc, len;
-       char *page;
         unsigned long ino = file->f_path.dentry->d_inode->i_ino;
-
-       page = (char *)__get_free_page(GFP_KERNEL);
-       if (!page)
-               return -ENOMEM;
-
-       len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_class(ino));
-       rc = simple_read_from_buffer(buf, count, ppos, page, len);
-       free_page((unsigned long)page);
-
-       return rc;
+       char res[TMPBUFLEN];
+       ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_class(ino));
+       return simple_read_from_buffer(buf, count, ppos, res, len);
  }
  
  static const struct file_operations sel_class_ops = {
@@ -1580,19 +1567,10 @@ static const struct file_operations sel_class_ops = {
  static ssize_t sel_read_perm(struct file *file, char __user *buf,
                                 size_t count, loff_t *ppos)
  {
-       ssize_t rc, len;
-       char *page;
         unsigned long ino = file->f_path.dentry->d_inode->i_ino;
-
-       page = (char *)__get_free_page(GFP_KERNEL);
-       if (!page)
-               return -ENOMEM;
-
-       len = snprintf(page, PAGE_SIZE, "%d", sel_ino_to_perm(ino));
-       rc = simple_read_from_buffer(buf, count, ppos, page, len);
-       free_page((unsigned long)page);
-
-       return rc;
+       char res[TMPBUFLEN];
+       ssize_t len = snprintf(res, sizeof(res), "%d", sel_ino_to_perm(ino));
+       return simple_read_from_buffer(buf, count, ppos, res, len);
  }
  
  static const struct file_operations sel_perm_ops = {
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c

index d583c054580889eff6f4e9080110aad7ae0370d1..ee0bb5735f35c98d6edfa7bb4c9590ef2a234cc4 100644 (file)
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -1171,7 +1171,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
  }
  
  /**
- * smack_file_mmap :
+ * smack_mmap_file :
   * Check permissions for a mmap operation.  The @file may be NULL, e.g.
   * if mapping anonymous memory.
   * @file contains the file structure for file to map (may be NULL).
@@ -1180,10 +1180,9 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
   * @flags contains the operational flags.
   * Return 0 if permission is granted.
   */
-static int smack_file_mmap(struct file *file,
+static int smack_mmap_file(struct file *file,
                            unsigned long reqprot, unsigned long prot,
-                          unsigned long flags, unsigned long addr,
-                          unsigned long addr_only)
+                          unsigned long flags)
  {
         struct smack_known *skp;
         struct smack_rule *srp;
@@ -1198,11 +1197,6 @@ static int smack_file_mmap(struct file *file,
         int tmay;
         int rc;
  
-       /* do DAC check on address space usage */
-       rc = cap_file_mmap(file, reqprot, prot, flags, addr, addr_only);
-       if (rc || addr_only)
-               return rc;
-
         if (file == NULL || file->f_dentry == NULL)
                 return 0;
  
@@ -3482,7 +3476,8 @@ struct security_operations smack_ops = {
         .file_ioctl =                   smack_file_ioctl,
         .file_lock =                    smack_file_lock,
         .file_fcntl =                   smack_file_fcntl,
-       .file_mmap =                    smack_file_mmap,
+       .mmap_file =                    smack_mmap_file,
+       .mmap_addr =                    cap_mmap_addr,
         .file_set_fowner =              smack_file_set_fowner,
         .file_send_sigiotask =          smack_file_send_sigiotask,
         .file_receive =                 smack_file_receive,
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 1 Jun 2012 18:53:44 +0000 (11:53 -0700)
Documentation/filesystems/Locking		patch \| blob \| history
Documentation/filesystems/vfs.txt		patch \| blob \| history
arch/alpha/include/asm/posix_types.h		patch \| blob \| history
arch/arm/include/asm/posix_types.h		patch \| blob \| history
arch/avr32/include/asm/posix_types.h		patch \| blob \| history
arch/blackfin/include/asm/posix_types.h		patch \| blob \| history
arch/cris/include/asm/posix_types.h		patch \| blob \| history
arch/frv/include/asm/posix_types.h		patch \| blob \| history
arch/h8300/include/asm/posix_types.h		patch \| blob \| history
arch/ia64/include/asm/posix_types.h		patch \| blob \| history
arch/ia64/kernel/perfmon.c		patch \| blob \| history
arch/ia64/kernel/sys_ia64.c		patch \| blob \| history
arch/m32r/include/asm/posix_types.h		patch \| blob \| history
arch/m68k/include/asm/posix_types.h		patch \| blob \| history
arch/mips/include/asm/posix_types.h		patch \| blob \| history
arch/mips/include/asm/stat.h		patch \| blob \| history
arch/mn10300/include/asm/posix_types.h		patch \| blob \| history
arch/parisc/include/asm/posix_types.h		patch \| blob \| history
arch/parisc/include/asm/stat.h		patch \| blob \| history
arch/powerpc/include/asm/posix_types.h		patch \| blob \| history
arch/powerpc/include/asm/stat.h		patch \| blob \| history
arch/s390/include/asm/posix_types.h		patch \| blob \| history
arch/sh/include/asm/posix_types_32.h		patch \| blob \| history
arch/sh/include/asm/posix_types_64.h		patch \| blob \| history
arch/sparc/include/asm/posix_types.h		patch \| blob \| history
arch/sparc/kernel/sys_sparc_64.c		patch \| blob \| history
arch/tile/include/asm/compat.h		patch \| blob \| history
arch/x86/include/asm/posix_types_32.h		patch \| blob \| history
drivers/base/soc.c		patch \| blob \| history
drivers/gpu/drm/i810/i810_dma.c		patch \| blob \| history
fs/9p/vfs_inode_dotl.c		patch \| blob \| history
fs/affs/affs.h		patch \| blob \| history
fs/aio.c		patch \| blob \| history
fs/attr.c		patch \| blob \| history
fs/binfmt_elf.c		patch \| blob \| history
fs/binfmt_flat.c		patch \| blob \| history
fs/btrfs/acl.c		patch \| blob \| history
fs/btrfs/backref.c		patch \| blob \| history
fs/btrfs/backref.h		patch \| blob \| history
fs/btrfs/btrfs_inode.h		patch \| blob \| history
fs/btrfs/check-integrity.c		patch \| blob \| history
fs/btrfs/ctree.c		patch \| blob \| history
fs/btrfs/ctree.h		patch \| blob \| history
fs/btrfs/delayed-inode.c		patch \| blob \| history
fs/btrfs/delayed-ref.c		patch \| blob \| history
fs/btrfs/delayed-ref.h		patch \| blob \| history
fs/btrfs/disk-io.c		patch \| blob \| history
fs/btrfs/disk-io.h		patch \| blob \| history
fs/btrfs/export.c		patch \| blob \| history
fs/btrfs/extent-tree.c		patch \| blob \| history
fs/btrfs/extent_io.c		patch \| blob \| history
fs/btrfs/extent_io.h		patch \| blob \| history
fs/btrfs/file.c		patch \| blob \| history
fs/btrfs/free-space-cache.c		patch \| blob \| history
fs/btrfs/inode.c		patch \| blob \| history
fs/btrfs/ioctl.c		patch \| blob \| history
fs/btrfs/ioctl.h		patch \| blob \| history
fs/btrfs/ordered-data.c		patch \| blob \| history
fs/btrfs/ordered-data.h		patch \| blob \| history
fs/btrfs/print-tree.c		patch \| blob \| history
fs/btrfs/reada.c		patch \| blob \| history
fs/btrfs/scrub.c		patch \| blob \| history
fs/btrfs/super.c		patch \| blob \| history
fs/btrfs/transaction.c		patch \| blob \| history
fs/btrfs/tree-log.c		patch \| blob \| history
fs/btrfs/ulist.c		patch \| blob \| history
fs/btrfs/ulist.h		patch \| blob \| history
fs/btrfs/volumes.c		patch \| blob \| history
fs/btrfs/volumes.h		patch \| blob \| history
fs/btrfs/xattr.c		patch \| blob \| history
fs/buffer.c		patch \| blob \| history
fs/ceph/export.c		patch \| blob \| history
fs/compat.c		patch \| blob \| history
fs/dcache.c		patch \| blob \| history
fs/ecryptfs/inode.c		patch \| blob \| history
fs/exec.c		patch \| blob \| history
fs/exportfs/expfs.c		patch \| blob \| history
fs/ext4/Kconfig		patch \| blob \| history
fs/ext4/balloc.c		patch \| blob \| history
fs/ext4/bitmap.c		patch \| blob \| history
fs/ext4/dir.c		patch \| blob \| history
fs/ext4/ext4.h		patch \| blob \| history
fs/ext4/ext4_extents.h		patch \| blob \| history
fs/ext4/ext4_jbd2.c		patch \| blob \| history
fs/ext4/ext4_jbd2.h		patch \| blob \| history
fs/ext4/extents.c		patch \| blob \| history
fs/ext4/file.c		patch \| blob \| history
fs/ext4/ialloc.c		patch \| blob \| history
fs/ext4/inode.c		patch \| blob \| history
fs/ext4/ioctl.c		patch \| blob \| history
fs/ext4/mballoc.c		patch \| blob \| history
fs/ext4/mmp.c		patch \| blob \| history
fs/ext4/namei.c		patch \| blob \| history
fs/ext4/resize.c		patch \| blob \| history
fs/ext4/super.c		patch \| blob \| history
fs/ext4/xattr.c		patch \| blob \| history
fs/ext4/xattr.h		patch \| blob \| history
fs/fat/inode.c		patch \| blob \| history
fs/fcntl.c		patch \| blob \| history
fs/file_table.c		patch \| blob \| history
fs/fuse/file.c		patch \| blob \| history
fs/fuse/inode.c		patch \| blob \| history
fs/gfs2/export.c		patch \| blob \| history
fs/hpfs/alloc.c		patch \| blob \| history
fs/hpfs/anode.c		patch \| blob \| history
fs/hpfs/dir.c		patch \| blob \| history
fs/hpfs/dnode.c		patch \| blob \| history
fs/hpfs/ea.c		patch \| blob \| history
fs/hpfs/hpfs.h		patch \| blob \| history
fs/hpfs/hpfs_fn.h		patch \| blob \| history
fs/hpfs/inode.c		patch \| blob \| history
fs/hpfs/map.c		patch \| blob \| history
fs/hpfs/namei.c		patch \| blob \| history
fs/hpfs/super.c		patch \| blob \| history
fs/inode.c		patch \| blob \| history
fs/internal.h		patch \| blob \| history
fs/isofs/export.c		patch \| blob \| history
fs/jbd2/Kconfig		patch \| blob \| history
fs/jbd2/commit.c		patch \| blob \| history
fs/jbd2/journal.c		patch \| blob \| history
fs/jbd2/recovery.c		patch \| blob \| history
fs/jbd2/revoke.c		patch \| blob \| history
fs/jbd2/transaction.c		patch \| blob \| history
fs/jffs2/jffs2_fs_sb.h		patch \| blob \| history
fs/jffs2/os-linux.h		patch \| blob \| history
fs/jffs2/super.c		patch \| blob \| history
fs/jffs2/wbuf.c		patch \| blob \| history
fs/lockd/svc.c		patch \| blob \| history
fs/locks.c		patch \| blob \| history
fs/namei.c		patch \| blob \| history
fs/namespace.c		patch \| blob \| history
fs/ncpfs/file.c		patch \| blob \| history
fs/ncpfs/ncp_fs_sb.h		patch \| blob \| history
fs/nfs/callback.c		patch \| blob \| history
fs/nfs/dir.c		patch \| blob \| history
fs/nfs/file.c		patch \| blob \| history
fs/nfsd/auth.c		patch \| blob \| history
fs/nfsd/export.c		patch \| blob \| history
fs/nfsd/fault_inject.c		patch \| blob \| history
fs/nfsd/nfs4callback.c		patch \| blob \| history
fs/nfsd/nfs4idmap.c		patch \| blob \| history
fs/nfsd/nfs4recover.c		patch \| blob \| history
fs/nfsd/nfs4state.c		patch \| blob \| history
fs/nfsd/nfs4xdr.c		patch \| blob \| history
fs/nfsd/nfsctl.c		patch \| blob \| history
fs/nfsd/nfssvc.c		patch \| blob \| history
fs/nfsd/state.h		patch \| blob \| history
fs/nfsd/xdr4.h		patch \| blob \| history
fs/nilfs2/namei.c		patch \| blob \| history
fs/notify/fsnotify.c		patch \| blob \| history
fs/ntfs/file.c		patch \| blob \| history
fs/ocfs2/blockcheck.c		patch \| blob \| history
fs/ocfs2/dlm/dlmast.c		patch \| blob \| history
fs/ocfs2/dlm/dlmcommon.h		patch \| blob \| history
fs/ocfs2/dlm/dlmdomain.c		patch \| blob \| history
fs/ocfs2/export.c		patch \| blob \| history
fs/ocfs2/inode.c		patch \| blob \| history
fs/ocfs2/ioctl.c		patch \| blob \| history
fs/ocfs2/move_extents.c		patch \| blob \| history
fs/ocfs2/namei.c		patch \| blob \| history
fs/ocfs2/symlink.c		patch \| blob \| history
fs/ocfs2/symlink.h		patch \| blob \| history
fs/open.c		patch \| blob \| history
fs/pipe.c		patch \| blob \| history
fs/pnode.c		patch \| blob \| history
fs/proc_namespace.c		patch \| blob \| history
fs/readdir.c		patch \| blob \| history
fs/reiserfs/inode.c		patch \| blob \| history
fs/reiserfs/journal.c		patch \| blob \| history
fs/reiserfs/reiserfs.h		patch \| blob \| history
fs/reiserfs/resize.c		patch \| blob \| history
fs/reiserfs/super.c		patch \| blob \| history
fs/signalfd.c		patch \| blob \| history
fs/splice.c		patch \| blob \| history
fs/statfs.c		patch \| blob \| history
fs/sync.c		patch \| blob \| history
fs/ubifs/dir.c		patch \| blob \| history
fs/udf/namei.c		patch \| blob \| history
fs/utimes.c		patch \| blob \| history
fs/xattr.c		patch \| blob \| history
fs/xfs/kmem.c		patch \| blob \| history
fs/xfs/kmem.h		patch \| blob \| history
fs/xfs/xfs_export.c		patch \| blob \| history
fs/xfs/xfs_file.c		patch \| blob \| history
fs/xfs/xfs_log.c		patch \| blob \| history
fs/xfs/xfs_log_priv.h		patch \| blob \| history
fs/xfs/xfs_trans.c		patch \| blob \| history
fs/xfs/xfs_trans.h		patch \| blob \| history
include/asm-generic/posix_types.h		patch \| blob \| history
include/linux/errno.h		patch \| blob \| history
include/linux/exportfs.h		patch \| blob \| history
include/linux/fs.h		patch \| blob \| history
include/linux/fsnotify_backend.h		patch \| blob \| history
include/linux/jbd2.h		patch \| blob \| history
include/linux/jbd_common.h		patch \| blob \| history
include/linux/lglock.h		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/security.h		patch \| blob \| history
include/linux/sunrpc/svc.h		patch \| blob \| history
include/linux/sunrpc/svcauth.h		patch \| blob \| history
include/linux/sunrpc/svcauth_gss.h		patch \| blob \| history
include/linux/types.h		patch \| blob \| history
ipc/shm.c		patch \| blob \| history
kernel/Makefile		patch \| blob \| history
kernel/lglock.c	[new file with mode: 0644]	patch \| blob
mm/cleancache.c		patch \| blob \| history
mm/filemap.c		patch \| blob \| history
mm/filemap_xip.c		patch \| blob \| history
mm/internal.h		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/mremap.c		patch \| blob \| history
mm/nommu.c		patch \| blob \| history
mm/shmem.c		patch \| blob \| history
mm/util.c		patch \| blob \| history
net/sched/sch_atm.c		patch \| blob \| history
net/sunrpc/auth_gss/gss_krb5_wrap.c		patch \| blob \| history
net/sunrpc/auth_gss/svcauth_gss.c		patch \| blob \| history
net/sunrpc/rpcb_clnt.c		patch \| blob \| history
net/sunrpc/svc.c		patch \| blob \| history
net/sunrpc/svc_xprt.c		patch \| blob \| history
net/sunrpc/svcauth_unix.c		patch \| blob \| history
security/apparmor/lsm.c		patch \| blob \| history
security/capability.c		patch \| blob \| history
security/commoncap.c		patch \| blob \| history
security/security.c		patch \| blob \| history
security/selinux/hooks.c		patch \| blob \| history
security/selinux/selinuxfs.c		patch \| blob \| history
security/smack/smack_lsm.c		patch \| blob \| history