Merge branch 'next-evm' of git://git.kernel.org/pub/scm/linux/kernel/git/zohar/ima...
author     James Morris <jmorris@namei.org>
           Tue, 9 Aug 2011 00:31:03 +0000 (10:31 +1000)
committer  James Morris <jmorris@namei.org>
           Tue, 9 Aug 2011 00:31:03 +0000 (10:31 +1000)
Conflicts:
fs/attr.c

Resolve conflict manually.

Signed-off-by: James Morris <jmorris@namei.org>
Documentation/kernel-parameters.txt
fs/attr.c
fs/btrfs/xattr.c
fs/gfs2/inode.c
fs/jfs/xattr.c
fs/xfs/linux-2.6/xfs_iops.c
include/linux/security.h
mm/shmem.c
security/integrity/ima/ima_main.c
security/security.c

diff --combined Documentation/kernel-parameters.txt
index e279b724291239677041f3d4a3f002e566db8ae6,db97ff1da8c00a0c3ff4134a70d61e7db3559127..cd7c861101472aa662dd08c9f89b7c82d23b0cd9
@@@ -48,6 -48,7 +48,7 @@@ parameter is applicable
        EDD     BIOS Enhanced Disk Drive Services (EDD) is enabled
        EFI     EFI Partitioning (GPT) is enabled
        EIDE    EIDE/ATAPI support is enabled.
+       EVM     Extended Verification Module
        FB      The frame buffer device is enabled.
        GCOV    GCOV profiling is enabled.
        HW      Appropriate hardware is enabled.
@@@ -163,11 -164,6 +164,11 @@@ bytes respectively. Such letter suffixe
  
                        See also Documentation/power/pm.txt, pci=noacpi
  
 +      acpi_rsdp=      [ACPI,EFI,KEXEC]
 +                      Pass the RSDP address to the kernel, mostly used
 +                      on machines running EFI runtime services to boot the
 +                      second kernel for kdump.
 +
        acpi_apic_instance=     [ACPI, IOAPIC]
                        Format: <int>
                        2: use 2nd APIC table, if available
                        /proc/<pid>/coredump_filter.
                        See also Documentation/filesystems/proc.txt.
  
 +      cpuidle.off=1   [CPU_IDLE]
 +                      disable the cpuidle sub-system
 +
        cpcihp_generic= [HW,PCI] Generic port I/O CompactPCI driver
                        Format:
                        <first_slot>,<last_slot>,<port>,<enum_bit>[,<debug>]
                        This option is obsoleted by the "netdev=" option, which
                        has equivalent usage. See its documentation for details.
  
+       evm=            [EVM]
+                       Format: { "fix" }
+                       Permit 'security.evm' to be updated regardless of
+                       current integrity status.
        failslab=
        fail_page_alloc=
        fail_make_request=[KNL]
                        for all guests.
                        Default is 1 (enabled) if in 64bit or 32bit-PAE mode
  
 -      kvm-intel.bypass_guest_pf=
 -                      [KVM,Intel] Disables bypassing of guest page faults
 -                      on Intel chips. Default is 1 (enabled)
 -
        kvm-intel.ept=  [KVM,Intel] Disable extended page tables
                        (virtualized MMU) support on capable Intel chips.
                        Default is 1 (enabled)
        no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
                        fault handling.
  
 +      no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
 +                      Steal time is still computed, but it won't influence
 +                      scheduler behaviour.
 +
        nolapic         [X86-32,APIC] Do not enable or use the local APIC.
  
        nolapic_timer   [X86-32,APIC] Do not use the local APIC timer.
                        See Documentation/sound/oss/oss-parameters.txt
  
        panic=          [KNL] Kernel behaviour on panic: delay <timeout>
 -                      seconds before rebooting
 +                      timeout > 0: seconds before rebooting
 +                      timeout = 0: wait forever
 +                      timeout < 0: reboot immediately
                        Format: <timeout>
  
        parkbd.port=    [HW] Parallel port number the keyboard adapter is
                                the default.
                                off: Turn ECRC off
                                on: Turn ECRC on.
 +              realloc         reallocate PCI resources if allocations done by the BIOS
 +                              are erroneous.
  
        pcie_aspm=      [PCIE] Forcibly enable or disable PCIe Active State Power
                        Management.
                        [HW,MOUSE] Controls Logitech smartscroll autorepeat.
                        0 = disabled, 1 = enabled (default).
  
 +      pstore.backend= Specify the name of the pstore backend to use
 +
        pt.             [PARIDE]
                        See Documentation/blockdev/paride.txt.
  
        ro              [KNL] Mount root device read-only on boot
  
        root=           [KNL] Root filesystem
 +                      See name_to_dev_t comment in init/do_mounts.c.
  
        rootdelay=      [KNL] Delay (in seconds) to pause before attempting to
                        mount the root filesystem
                        <port#>,<js1>,<js2>,<js3>,<js4>,<js5>,<js6>,<js7>
                        See also Documentation/input/joystick-parport.txt
  
 +      udbg-immortal   [PPC] When debugging early kernel crashes that
 +                      happen after console_init() and before a proper
 +                      console driver takes over, this boot option might
 +                      help to see what's going on.
 +
        uhash_entries=  [KNL,NET]
                        Set number of hash buckets for UDP/UDP-Lite connections
  
        unknown_nmi_panic
                        [X86] Cause panic on unknown NMI.
  
 +      usbcore.authorized_default=
 +                      [USB] Default USB device authorization:
 +                      (default -1 = authorized except for wireless USB,
 +                      0 = not authorized, 1 = authorized)
 +
        usbcore.autosuspend=
                        [USB] The autosuspend time delay (in seconds) used
                        for newly-detected USB devices (default 2).  This
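For orientation before the code diffs: a minimal sketch of how a boot parameter such as the "evm=fix" option documented above is plausibly wired up. __setup() is the standard kernel mechanism for early command-line parameters; the evm_fixmode flag name is an assumption, not taken from this diff.

	/* Sketch only: parse "evm=fix" from the kernel command line. */
	static int evm_fixmode __read_mostly;	/* hypothetical flag */

	static int __init evm_set_fixmode(char *str)
	{
		if (strncmp(str, "fix", 3) == 0)
			evm_fixmode = 1;	/* permit security.evm updates regardless of status */
		return 0;
	}
	__setup("evm=", evm_set_fixmode);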
diff --combined fs/attr.c
index 538e27959d3f7fec79a8a0647723830b2bb4a5b4,5ad45d3cc20ac4a7011b5635e46f96334d455611..7ee7ba48831358f7bf4f738a13c20511dcdd1698
--- a/fs/attr.c
+++ b/fs/attr.c
@@@ -13,6 -13,7 +13,7 @@@
  #include <linux/fsnotify.h>
  #include <linux/fcntl.h>
  #include <linux/security.h>
+ #include <linux/evm.h>
  
  /**
   * inode_change_ok - check if attribute changes to an inode are allowed
@@@ -232,13 -233,21 +233,15 @@@ int notify_change(struct dentry * dentr
        if (error)
                return error;
  
 -      if (ia_valid & ATTR_SIZE)
 -              down_write(&dentry->d_inode->i_alloc_sem);
 -
        if (inode->i_op->setattr)
                error = inode->i_op->setattr(dentry, attr);
        else
                error = simple_setattr(dentry, attr);
  
-       if (!error)
 -      if (ia_valid & ATTR_SIZE)
 -              up_write(&dentry->d_inode->i_alloc_sem);
 -
+       if (!error) {
                fsnotify_change(dentry, ia_valid);
+               evm_inode_post_setattr(dentry, ia_valid);
+       }
  
        return error;
  }
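The fs/attr.c hunk above adds a single call: on a successful setattr, notify_change() now lets EVM refresh its metadata HMAC alongside the fsnotify event. A sketch of what the new hook plausibly does on the EVM side (the helper name evm_update_evmxattr() and the exact body are assumptions, not shown in this diff):

	void evm_inode_post_setattr(struct dentry *dentry, int ia_valid)
	{
		/* Only mode/ownership changes feed into the HMAC that
		 * EVM keeps in the 'security.evm' xattr. */
		if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))
			evm_update_evmxattr(dentry, NULL, NULL, 0);	/* assumed helper */
	}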
diff --combined fs/btrfs/xattr.c
index d733b9cfea343207e71bdb6d8d8b51323717c800,a039e6ed4ce0ed2812951d87e89974c6c0e3ab4c..6196e1a76c14fae0be3f584c9440429f68eef72a
@@@ -102,57 -102,43 +102,57 @@@ static int do_setxattr(struct btrfs_tra
        if (!path)
                return -ENOMEM;
  
 -      /* first lets see if we already have this xattr */
 -      di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
 -                              strlen(name), -1);
 -      if (IS_ERR(di)) {
 -              ret = PTR_ERR(di);
 -              goto out;
 -      }
 -
 -      /* ok we already have this xattr, lets remove it */
 -      if (di) {
 -              /* if we want create only exit */
 -              if (flags & XATTR_CREATE) {
 -                      ret = -EEXIST;
 +      if (flags & XATTR_REPLACE) {
 +              di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
 +                                      name_len, -1);
 +              if (IS_ERR(di)) {
 +                      ret = PTR_ERR(di);
 +                      goto out;
 +              } else if (!di) {
 +                      ret = -ENODATA;
                        goto out;
                }
 -
                ret = btrfs_delete_one_dir_name(trans, root, path, di);
 -              BUG_ON(ret);
 +              if (ret)
 +                      goto out;
                btrfs_release_path(path);
 +      }
  
 -              /* if we don't have a value then we are removing the xattr */
 -              if (!value)
 +again:
 +      ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
 +                                    name, name_len, value, size);
 +      if (ret == -EEXIST) {
 +              if (flags & XATTR_CREATE)
                        goto out;
 -      } else {
 +              /*
 +               * We can't use the path we already have since we won't have the
 +               * proper locking for a delete, so release the path and
 +               * re-lookup to delete the thing.
 +               */
                btrfs_release_path(path);
 +              di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode),
 +                                      name, name_len, -1);
 +              if (IS_ERR(di)) {
 +                      ret = PTR_ERR(di);
 +                      goto out;
 +              } else if (!di) {
 +                      /* Shouldn't happen but just in case... */
 +                      btrfs_release_path(path);
 +                      goto again;
 +              }
  
 -              if (flags & XATTR_REPLACE) {
 -                      /* we couldn't find the attr to replace */
 -                      ret = -ENODATA;
 +              ret = btrfs_delete_one_dir_name(trans, root, path, di);
 +              if (ret)
                        goto out;
 +
 +              /*
 +               * We have a value to set, so go back and try to insert it now.
 +               */
 +              if (value) {
 +                      btrfs_release_path(path);
 +                      goto again;
                }
        }
 -
 -      /* ok we have to create a completely new xattr */
 -      ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
 -                                    name, name_len, value, size);
 -      BUG_ON(ret);
  out:
        btrfs_free_path(path);
        return ret;
@@@ -374,36 -360,36 +374,36 @@@ int btrfs_removexattr(struct dentry *de
                                XATTR_REPLACE);
  }
  
- int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
-                             struct inode *inode, struct inode *dir,
-                             const struct qstr *qstr)
+ int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                    void *fs_info)
  {
-       int err;
-       size_t len;
-       void *value;
-       char *suffix;
+       const struct xattr *xattr;
+       struct btrfs_trans_handle *trans = fs_info;
        char *name;
+       int err = 0;
  
-       err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-                                          &len);
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       return 0;
-               return err;
-       }
-       name = kmalloc(XATTR_SECURITY_PREFIX_LEN + strlen(suffix) + 1,
-                      GFP_NOFS);
-       if (!name) {
-               err = -ENOMEM;
-       } else {
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+                              strlen(xattr->name) + 1, GFP_NOFS);
+               if (!name) {
+                       err = -ENOMEM;
+                       break;
+               }
                strcpy(name, XATTR_SECURITY_PREFIX);
-               strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-               err = __btrfs_setxattr(trans, inode, name, value, len, 0);
+               strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+               err = __btrfs_setxattr(trans, inode, name,
+                                      xattr->value, xattr->value_len, 0);
                kfree(name);
+               if (err < 0)
+                       break;
        }
-       kfree(suffix);
-       kfree(value);
        return err;
  }
+ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
+                             struct inode *inode, struct inode *dir,
+                             const struct qstr *qstr)
+ {
+       return security_inode_init_security(inode, dir, qstr,
+                                           &btrfs_initxattrs, trans);
+ }
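The do_setxattr() rewrite above replaces the lookup-then-insert flow, and its BUG_ON() calls, with insert-first logic that handles -EEXIST and propagates errors. The userspace-visible flag semantics it implements are the standard setxattr(2) ones; a minimal sketch from the caller's side:

	#include <errno.h>
	#include <sys/xattr.h>

	/* Illustrative only: the flag behaviour the rewrite preserves. */
	static int xattr_flags_demo(const char *path)
	{
		if (setxattr(path, "user.foo", "bar", 3, XATTR_CREATE) < 0 &&
		    errno == EEXIST)
			return -1;	/* attribute already present: create refused */
		if (setxattr(path, "user.foo", "baz", 3, XATTR_REPLACE) < 0 &&
		    errno == ENODATA)
			return -1;	/* attribute absent: replace refused */
		return 0;
	}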
diff --combined fs/gfs2/inode.c
index 900cf986aadcc155d26028971f71bc6fa9a972ec,1d3a1a65172193a62f09c6bd5430d70122af10fa..6525b804d5ecb6c8a917aeed577272458183af65
@@@ -307,7 -307,7 +307,7 @@@ struct inode *gfs2_lookupi(struct inod
        }
  
        if (!is_root) {
 -              error = gfs2_permission(dir, MAY_EXEC, 0);
 +              error = gfs2_permission(dir, MAY_EXEC);
                if (error)
                        goto out;
        }
@@@ -337,7 -337,7 +337,7 @@@ static int create_ok(struct gfs2_inode 
  {
        int error;
  
 -      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
  
@@@ -624,31 -624,29 +624,29 @@@ fail
        return error;
  }
  
- static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
-                             const struct qstr *qstr)
+ int gfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                   void *fs_info)
  {
-       int err;
-       size_t len;
-       void *value;
-       char *name;
-       err = security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
-                                          &name, &value, &len);
-       if (err) {
-               if (err == -EOPNOTSUPP)
-                       return 0;
-               return err;
+       const struct xattr *xattr;
+       int err = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               err = __gfs2_xattr_set(inode, xattr->name, xattr->value,
+                                      xattr->value_len, 0,
+                                      GFS2_EATYPE_SECURITY);
+               if (err < 0)
+                       break;
        }
-       err = __gfs2_xattr_set(&ip->i_inode, name, value, len, 0,
-                              GFS2_EATYPE_SECURITY);
-       kfree(value);
-       kfree(name);
        return err;
  }
  
+ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip,
+                             const struct qstr *qstr)
+ {
+       return security_inode_init_security(&ip->i_inode, &dip->i_inode, qstr,
+                                           &gfs2_initxattrs, NULL);
+ }
  /**
   * gfs2_create_inode - Create a new inode
   * @dir: The parent directory
@@@ -792,8 -790,13 +790,8 @@@ static int gfs2_create(struct inode *di
  static struct dentry *gfs2_lookup(struct inode *dir, struct dentry *dentry,
                                  struct nameidata *nd)
  {
 -      struct inode *inode = NULL;
 -
 -      inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 -      if (inode && IS_ERR(inode))
 -              return ERR_CAST(inode);
 -
 -      if (inode) {
 +      struct inode *inode = gfs2_lookupi(dir, &dentry->d_name, 0);
 +      if (inode && !IS_ERR(inode)) {
                struct gfs2_glock *gl = GFS2_I(inode)->i_gl;
                struct gfs2_holder gh;
                int error;
                        return ERR_PTR(error);
                }
                gfs2_glock_dq_uninit(&gh);
 -              return d_splice_alias(inode, dentry);
        }
 -      d_add(dentry, inode);
 -
 -      return NULL;
 +      return d_splice_alias(inode, dentry);
  }
  
  /**
@@@ -849,7 -855,7 +847,7 @@@ static int gfs2_link(struct dentry *old
        if (inode->i_nlink == 0)
                goto out_gunlock;
  
 -      error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_gunlock;
  
@@@ -982,7 -988,7 +980,7 @@@ static int gfs2_unlink_ok(struct gfs2_i
        if (IS_APPEND(&dip->i_inode))
                return -EPERM;
  
 -      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC, 0);
 +      error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
  
@@@ -1328,7 -1334,7 +1326,7 @@@ static int gfs2_rename(struct inode *od
                        }
                }
        } else {
 -              error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC, 0);
 +              error = gfs2_permission(ndir, MAY_WRITE | MAY_EXEC);
                if (error)
                        goto out_gunlock;
  
        /* Check out the dir to be renamed */
  
        if (dir_rename) {
 -              error = gfs2_permission(odentry->d_inode, MAY_WRITE, 0);
 +              error = gfs2_permission(odentry->d_inode, MAY_WRITE);
                if (error)
                        goto out_gunlock;
        }
@@@ -1535,7 -1541,7 +1533,7 @@@ static void gfs2_put_link(struct dentr
   * Returns: errno
   */
  
 -int gfs2_permission(struct inode *inode, int mask, unsigned int flags)
 +int gfs2_permission(struct inode *inode, int mask)
  {
        struct gfs2_inode *ip;
        struct gfs2_holder i_gh;
  
        ip = GFS2_I(inode);
        if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
 -              if (flags & IPERM_FLAG_RCU)
 +              if (mask & MAY_NOT_BLOCK)
                        return -ECHILD;
                error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
                if (error)
        if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
                error = -EACCES;
        else
 -              error = generic_permission(inode, mask, flags, gfs2_check_acl);
 +              error = generic_permission(inode, mask);
        if (unlock)
                gfs2_glock_dq_uninit(&i_gh);
  
@@@ -1846,7 -1852,6 +1844,7 @@@ const struct inode_operations gfs2_file
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
  const struct inode_operations gfs2_dir_iops = {
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
  const struct inode_operations gfs2_symlink_iops = {
        .listxattr = gfs2_listxattr,
        .removexattr = gfs2_removexattr,
        .fiemap = gfs2_fiemap,
 +      .get_acl = gfs2_get_acl,
  };
  
diff --combined fs/jfs/xattr.c
index e87fedef23db7938aa7b38f64f385b0615e193b4,e982509292f8ed02f48723eb5086c04f6add66df..26683e15b3ac9f23991110687023e0f6e3bb0db0
@@@ -693,7 -693,8 +693,7 @@@ static int can_set_system_xattr(struct 
                        return rc;
                }
                if (acl) {
 -                      mode_t mode = inode->i_mode;
 -                      rc = posix_acl_equiv_mode(acl, &mode);
 +                      rc = posix_acl_equiv_mode(acl, &inode->i_mode);
                        posix_acl_release(acl);
                        if (rc < 0) {
                                printk(KERN_ERR
                                       rc);
                                return rc;
                        }
 -                      inode->i_mode = mode;
                        mark_inode_dirty(inode);
                }
                /*
@@@ -1089,38 -1091,37 +1089,37 @@@ int jfs_removexattr(struct dentry *dent
  }
  
  #ifdef CONFIG_JFS_SECURITY
- int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
-                     const struct qstr *qstr)
+ int jfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                  void *fs_info)
  {
-       int rc;
-       size_t len;
-       void *value;
-       char *suffix;
+       const struct xattr *xattr;
+       tid_t *tid = fs_info;
        char *name;
-       rc = security_inode_init_security(inode, dir, qstr, &suffix, &value,
-                                         &len);
-       if (rc) {
-               if (rc == -EOPNOTSUPP)
-                       return 0;
-               return rc;
-       }
-       name = kmalloc(XATTR_SECURITY_PREFIX_LEN + 1 + strlen(suffix),
-                      GFP_NOFS);
-       if (!name) {
-               rc = -ENOMEM;
-               goto kmalloc_failed;
+       int err = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
+                              strlen(xattr->name) + 1, GFP_NOFS);
+               if (!name) {
+                       err = -ENOMEM;
+                       break;
+               }
+               strcpy(name, XATTR_SECURITY_PREFIX);
+               strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+               err = __jfs_setxattr(*tid, inode, name,
+                                    xattr->value, xattr->value_len, 0);
+               kfree(name);
+               if (err < 0)
+                       break;
        }
-       strcpy(name, XATTR_SECURITY_PREFIX);
-       strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix);
-       rc = __jfs_setxattr(tid, inode, name, value, len, 0);
-       kfree(name);
- kmalloc_failed:
-       kfree(suffix);
-       kfree(value);
+       return err;
+ }
  
-       return rc;
+ int jfs_init_security(tid_t tid, struct inode *inode, struct inode *dir,
+                     const struct qstr *qstr)
+ {
+       return security_inode_init_security(inode, dir, qstr,
+                                           &jfs_initxattrs, &tid);
  }
  #endif
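Beyond the initxattrs conversion, the first jfs hunk picks up another tree-wide cleanup: posix_acl_equiv_mode() is now handed a pointer directly into inode->i_mode, so the mode_t temporary and its write-back line disappear. The resulting shape, in outline:

	if (acl) {
		/* updates inode->i_mode in place on success */
		rc = posix_acl_equiv_mode(acl, &inode->i_mode);
		posix_acl_release(acl);
		if (rc < 0)
			return rc;
		mark_inode_dirty(inode);
	}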
diff --combined fs/xfs/linux-2.6/xfs_iops.c
index b9c172b3fbbec1c6b8ac2874151f0bb1aa19240d,27a3658b830f8cdd294820ffddde092ce441574b..37194607162d38c326804c5e55a279e4124aa975
@@@ -39,7 -39,6 +39,7 @@@
  #include "xfs_buf_item.h"
  #include "xfs_utils.h"
  #include "xfs_vnodeops.h"
 +#include "xfs_inode_item.h"
  #include "xfs_trace.h"
  
  #include <linux/capability.h>
@@@ -94,37 -93,38 +94,38 @@@ xfs_mark_inode_dirty
                mark_inode_dirty(inode);
  }
  
+ int xfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
+                  void *fs_info)
+ {
+       const struct xattr *xattr;
+       struct xfs_inode *ip = XFS_I(inode);
+       int error = 0;
+       for (xattr = xattr_array; xattr->name != NULL; xattr++) {
+               error = xfs_attr_set(ip, xattr->name, xattr->value,
+                                    xattr->value_len, ATTR_SECURE);
+               if (error < 0)
+                       break;
+       }
+       return error;
+ }
  /*
   * Hook in SELinux.  This is not quite correct yet, what we really need
   * here (as we do for default ACLs) is a mechanism by which creation of
   * these attrs can be journalled at inode creation time (along with the
   * inode, of course, such that log replay can't cause these to be lost).
   */
  STATIC int
  xfs_init_security(
        struct inode    *inode,
        struct inode    *dir,
        const struct qstr *qstr)
  {
-       struct xfs_inode *ip = XFS_I(inode);
-       size_t          length;
-       void            *value;
-       unsigned char   *name;
-       int             error;
-       error = security_inode_init_security(inode, dir, qstr, (char **)&name,
-                                            &value, &length);
-       if (error) {
-               if (error == -EOPNOTSUPP)
-                       return 0;
-               return -error;
-       }
-       error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
-       kfree(name);
-       kfree(value);
-       return error;
+       return security_inode_init_security(inode, dir, qstr,
+                                           &xfs_initxattrs, NULL);
  }
  
  static void
@@@ -202,9 -202,9 +203,9 @@@ xfs_vn_mknod
  
        if (default_acl) {
                error = -xfs_inherit_acl(inode, default_acl);
 +              default_acl = NULL;
                if (unlikely(error))
                        goto out_cleanup_inode;
 -              posix_acl_release(default_acl);
        }
  
  
@@@ -498,442 -498,12 +499,442 @@@ xfs_vn_getattr
        return 0;
  }
  
 +int
 +xfs_setattr_nonsize(
 +      struct xfs_inode        *ip,
 +      struct iattr            *iattr,
 +      int                     flags)
 +{
 +      xfs_mount_t             *mp = ip->i_mount;
 +      struct inode            *inode = VFS_I(ip);
 +      int                     mask = iattr->ia_valid;
 +      xfs_trans_t             *tp;
 +      int                     error;
 +      uid_t                   uid = 0, iuid = 0;
 +      gid_t                   gid = 0, igid = 0;
 +      struct xfs_dquot        *udqp = NULL, *gdqp = NULL;
 +      struct xfs_dquot        *olddquot1 = NULL, *olddquot2 = NULL;
 +
 +      trace_xfs_setattr(ip);
 +
 +      if (mp->m_flags & XFS_MOUNT_RDONLY)
 +              return XFS_ERROR(EROFS);
 +
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return XFS_ERROR(EIO);
 +
 +      error = -inode_change_ok(inode, iattr);
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      ASSERT((mask & ATTR_SIZE) == 0);
 +
 +      /*
 +       * If disk quotas is on, we make sure that the dquots do exist on disk,
 +       * before we start any other transactions. Trying to do this later
 +       * is messy. We don't care to take a readlock to look at the ids
 +       * in inode here, because we can't hold it across the trans_reserve.
 +       * If the IDs do change before we take the ilock, we're covered
 +       * because the i_*dquot fields will get updated anyway.
 +       */
 +      if (XFS_IS_QUOTA_ON(mp) && (mask & (ATTR_UID|ATTR_GID))) {
 +              uint    qflags = 0;
 +
 +              if ((mask & ATTR_UID) && XFS_IS_UQUOTA_ON(mp)) {
 +                      uid = iattr->ia_uid;
 +                      qflags |= XFS_QMOPT_UQUOTA;
 +              } else {
 +                      uid = ip->i_d.di_uid;
 +              }
 +              if ((mask & ATTR_GID) && XFS_IS_GQUOTA_ON(mp)) {
 +                      gid = iattr->ia_gid;
 +                      qflags |= XFS_QMOPT_GQUOTA;
 +              }  else {
 +                      gid = ip->i_d.di_gid;
 +              }
 +
 +              /*
 +               * We take a reference when we initialize udqp and gdqp,
 +               * so it is important that we never blindly double trip on
 +               * the same variable. See xfs_create() for an example.
 +               */
 +              ASSERT(udqp == NULL);
 +              ASSERT(gdqp == NULL);
 +              error = xfs_qm_vop_dqalloc(ip, uid, gid, xfs_get_projid(ip),
 +                                       qflags, &udqp, &gdqp);
 +              if (error)
 +                      return error;
 +      }
 +
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
 +      error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
 +      if (error)
 +              goto out_dqrele;
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +
 +      /*
 +       * Change file ownership.  Must be the owner or privileged.
 +       */
 +      if (mask & (ATTR_UID|ATTR_GID)) {
 +              /*
 +               * These IDs could have changed since we last looked at them.
 +               * But, we're assured that if the ownership did change
 +               * while we didn't have the inode locked, inode's dquot(s)
 +               * would have changed also.
 +               */
 +              iuid = ip->i_d.di_uid;
 +              igid = ip->i_d.di_gid;
 +              gid = (mask & ATTR_GID) ? iattr->ia_gid : igid;
 +              uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
 +
 +              /*
 +               * Do a quota reservation only if uid/gid is actually
 +               * going to change.
 +               */
 +              if (XFS_IS_QUOTA_RUNNING(mp) &&
 +                  ((XFS_IS_UQUOTA_ON(mp) && iuid != uid) ||
 +                   (XFS_IS_GQUOTA_ON(mp) && igid != gid))) {
 +                      ASSERT(tp);
 +                      error = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp,
 +                                              capable(CAP_FOWNER) ?
 +                                              XFS_QMOPT_FORCE_RES : 0);
 +                      if (error)      /* out of quota */
 +                              goto out_trans_cancel;
 +              }
 +      }
 +
 +      xfs_trans_ijoin(tp, ip);
 +
 +      /*
 +       * Change file ownership.  Must be the owner or privileged.
 +       */
 +      if (mask & (ATTR_UID|ATTR_GID)) {
 +              /*
 +               * CAP_FSETID overrides the following restrictions:
 +               *
 +               * The set-user-ID and set-group-ID bits of a file will be
 +               * cleared upon successful return from chown()
 +               */
 +              if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
 +                  !capable(CAP_FSETID))
 +                      ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
 +
 +              /*
 +               * Change the ownerships and register quota modifications
 +               * in the transaction.
 +               */
 +              if (iuid != uid) {
 +                      if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_UQUOTA_ON(mp)) {
 +                              ASSERT(mask & ATTR_UID);
 +                              ASSERT(udqp);
 +                              olddquot1 = xfs_qm_vop_chown(tp, ip,
 +                                                      &ip->i_udquot, udqp);
 +                      }
 +                      ip->i_d.di_uid = uid;
 +                      inode->i_uid = uid;
 +              }
 +              if (igid != gid) {
 +                      if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_GQUOTA_ON(mp)) {
 +                              ASSERT(!XFS_IS_PQUOTA_ON(mp));
 +                              ASSERT(mask & ATTR_GID);
 +                              ASSERT(gdqp);
 +                              olddquot2 = xfs_qm_vop_chown(tp, ip,
 +                                                      &ip->i_gdquot, gdqp);
 +                      }
 +                      ip->i_d.di_gid = gid;
 +                      inode->i_gid = gid;
 +              }
 +      }
 +
 +      /*
 +       * Change file access modes.
 +       */
 +      if (mask & ATTR_MODE) {
 +              umode_t mode = iattr->ia_mode;
 +
 +              if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
 +                      mode &= ~S_ISGID;
 +
 +              ip->i_d.di_mode &= S_IFMT;
 +              ip->i_d.di_mode |= mode & ~S_IFMT;
 +
 +              inode->i_mode &= S_IFMT;
 +              inode->i_mode |= mode & ~S_IFMT;
 +      }
 +
 +      /*
 +       * Change file access or modified times.
 +       */
 +      if (mask & ATTR_ATIME) {
 +              inode->i_atime = iattr->ia_atime;
 +              ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec;
 +              ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_CTIME) {
 +              inode->i_ctime = iattr->ia_ctime;
 +              ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 +              ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_MTIME) {
 +              inode->i_mtime = iattr->ia_mtime;
 +              ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
 +              ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +
 +      XFS_STATS_INC(xs_ig_attrchg);
 +
 +      if (mp->m_flags & XFS_MOUNT_WSYNC)
 +              xfs_trans_set_sync(tp);
 +      error = xfs_trans_commit(tp, 0);
 +
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +
 +      /*
 +       * Release any dquot(s) the inode had kept before chown.
 +       */
 +      xfs_qm_dqrele(olddquot1);
 +      xfs_qm_dqrele(olddquot2);
 +      xfs_qm_dqrele(udqp);
 +      xfs_qm_dqrele(gdqp);
 +
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      /*
 +       * XXX(hch): Updating the ACL entries is not atomic vs the i_mode
 +       *           update.  We could avoid this with linked transactions
 +       *           and passing down the transaction pointer all the way
 +       *           to attr_set.  No previous user of the generic
 +       *           Posix ACL code seems to care about this issue either.
 +       */
 +      if ((mask & ATTR_MODE) && !(flags & XFS_ATTR_NOACL)) {
 +              error = -xfs_acl_chmod(inode);
 +              if (error)
 +                      return XFS_ERROR(error);
 +      }
 +
 +      return 0;
 +
 +out_trans_cancel:
 +      xfs_trans_cancel(tp, 0);
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +out_dqrele:
 +      xfs_qm_dqrele(udqp);
 +      xfs_qm_dqrele(gdqp);
 +      return error;
 +}
 +
 +/*
 + * Truncate file.  Must have write permission and not be a directory.
 + */
 +int
 +xfs_setattr_size(
 +      struct xfs_inode        *ip,
 +      struct iattr            *iattr,
 +      int                     flags)
 +{
 +      struct xfs_mount        *mp = ip->i_mount;
 +      struct inode            *inode = VFS_I(ip);
 +      int                     mask = iattr->ia_valid;
 +      struct xfs_trans        *tp;
 +      int                     error;
 +      uint                    lock_flags;
 +      uint                    commit_flags = 0;
 +
 +      trace_xfs_setattr(ip);
 +
 +      if (mp->m_flags & XFS_MOUNT_RDONLY)
 +              return XFS_ERROR(EROFS);
 +
 +      if (XFS_FORCED_SHUTDOWN(mp))
 +              return XFS_ERROR(EIO);
 +
 +      error = -inode_change_ok(inode, iattr);
 +      if (error)
 +              return XFS_ERROR(error);
 +
 +      ASSERT(S_ISREG(ip->i_d.di_mode));
 +      ASSERT((mask & (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_ATIME|ATTR_ATIME_SET|
 +                      ATTR_MTIME_SET|ATTR_KILL_SUID|ATTR_KILL_SGID|
 +                      ATTR_KILL_PRIV|ATTR_TIMES_SET)) == 0);
 +
 +      lock_flags = XFS_ILOCK_EXCL;
 +      if (!(flags & XFS_ATTR_NOLOCK))
 +              lock_flags |= XFS_IOLOCK_EXCL;
 +      xfs_ilock(ip, lock_flags);
 +
 +      /*
 +       * Short circuit the truncate case for zero length files.
 +       */
 +      if (iattr->ia_size == 0 &&
 +          ip->i_size == 0 && ip->i_d.di_nextents == 0) {
 +              if (!(mask & (ATTR_CTIME|ATTR_MTIME)))
 +                      goto out_unlock;
 +
 +              /*
 +               * Use the regular setattr path to update the timestamps.
 +               */
 +              xfs_iunlock(ip, lock_flags);
 +              iattr->ia_valid &= ~ATTR_SIZE;
 +              return xfs_setattr_nonsize(ip, iattr, 0);
 +      }
 +
 +      /*
 +       * Make sure that the dquots are attached to the inode.
 +       */
 +      error = xfs_qm_dqattach_locked(ip, 0);
 +      if (error)
 +              goto out_unlock;
 +
 +      /*
 +       * Now we can make the changes.  Before we join the inode to the
 +       * transaction, take care of the part of the truncation that must be
 +       * done without the inode lock.  This needs to be done before joining
 +       * the inode to the transaction, because the inode cannot be unlocked
 +       * once it is a part of the transaction.
 +       */
 +      if (iattr->ia_size > ip->i_size) {
 +              /*
 +               * Do the first part of growing a file: zero any data in the
 +               * last block that is beyond the old EOF.  We need to do this
 +               * before the inode is joined to the transaction to modify
 +               * i_size.
 +               */
 +              error = xfs_zero_eof(ip, iattr->ia_size, ip->i_size);
 +              if (error)
 +                      goto out_unlock;
 +      }
 +      xfs_iunlock(ip, XFS_ILOCK_EXCL);
 +      lock_flags &= ~XFS_ILOCK_EXCL;
 +
 +      /*
 +       * We are going to log the inode size change in this transaction so
 +       * any previous writes that are beyond the on disk EOF and the new
 +       * EOF that have not been written out need to be written here.  If we
 +       * do not write the data out, we expose ourselves to the null files
 +       * problem.
 +       *
 +       * Only flush from the on disk size to the smaller of the in memory
 +       * file size or the new size as that's the range we really care about
 +       * here and prevents waiting for other data not within the range we
 +       * care about here.
 +       */
 +      if (ip->i_size != ip->i_d.di_size && iattr->ia_size > ip->i_d.di_size) {
 +              error = xfs_flush_pages(ip, ip->i_d.di_size, iattr->ia_size,
 +                                      XBF_ASYNC, FI_NONE);
 +              if (error)
 +                      goto out_unlock;
 +      }
 +
 +      /*
 +       * Wait for all I/O to complete.
 +       */
 +      xfs_ioend_wait(ip);
 +
 +      error = -block_truncate_page(inode->i_mapping, iattr->ia_size,
 +                                   xfs_get_blocks);
 +      if (error)
 +              goto out_unlock;
 +
 +      tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);
 +      error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
 +                               XFS_TRANS_PERM_LOG_RES,
 +                               XFS_ITRUNCATE_LOG_COUNT);
 +      if (error)
 +              goto out_trans_cancel;
 +
 +      truncate_setsize(inode, iattr->ia_size);
 +
 +      commit_flags = XFS_TRANS_RELEASE_LOG_RES;
 +      lock_flags |= XFS_ILOCK_EXCL;
 +
 +      xfs_ilock(ip, XFS_ILOCK_EXCL);
 +
 +      xfs_trans_ijoin(tp, ip);
 +
 +      /*
 +       * Only change the c/mtime if we are changing the size or we are
 +       * explicitly asked to change it.  This handles the semantic difference
 +       * between truncate() and ftruncate() as implemented in the VFS.
 +       *
 +       * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
 +       * special case where we need to update the times despite not having
 +       * these flags set.  For all other operations the VFS set these flags
 +       * explicitly if it wants a timestamp update.
 +       */
 +      if (iattr->ia_size != ip->i_size &&
 +          (!(mask & (ATTR_CTIME | ATTR_MTIME)))) {
 +              iattr->ia_ctime = iattr->ia_mtime =
 +                      current_fs_time(inode->i_sb);
 +              mask |= ATTR_CTIME | ATTR_MTIME;
 +      }
 +
 +      if (iattr->ia_size > ip->i_size) {
 +              ip->i_d.di_size = iattr->ia_size;
 +              ip->i_size = iattr->ia_size;
 +      } else if (iattr->ia_size <= ip->i_size ||
 +                 (iattr->ia_size == 0 && ip->i_d.di_nextents)) {
 +              error = xfs_itruncate_data(&tp, ip, iattr->ia_size);
 +              if (error)
 +                      goto out_trans_abort;
 +
 +              /*
 +               * Truncated "down", so we're removing references to old data
 +               * here - if we delay flushing for a long time, we expose
 +               * ourselves unduly to the notorious NULL files problem.  So,
 +               * we mark this inode and flush it when the file is closed,
 +               * and do not wait the usual (long) time for writeout.
 +               */
 +              xfs_iflags_set(ip, XFS_ITRUNCATED);
 +      }
 +
 +      if (mask & ATTR_CTIME) {
 +              inode->i_ctime = iattr->ia_ctime;
 +              ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec;
 +              ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +      if (mask & ATTR_MTIME) {
 +              inode->i_mtime = iattr->ia_mtime;
 +              ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec;
 +              ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec;
 +              ip->i_update_core = 1;
 +      }
 +
 +      xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 +
 +      XFS_STATS_INC(xs_ig_attrchg);
 +
 +      if (mp->m_flags & XFS_MOUNT_WSYNC)
 +              xfs_trans_set_sync(tp);
 +
 +      error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
 +out_unlock:
 +      if (lock_flags)
 +              xfs_iunlock(ip, lock_flags);
 +      return error;
 +
 +out_trans_abort:
 +      commit_flags |= XFS_TRANS_ABORT;
 +out_trans_cancel:
 +      xfs_trans_cancel(tp, commit_flags);
 +      goto out_unlock;
 +}
 +
  STATIC int
  xfs_vn_setattr(
        struct dentry   *dentry,
        struct iattr    *iattr)
  {
 -      return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
 +      if (iattr->ia_valid & ATTR_SIZE)
 +              return -xfs_setattr_size(XFS_I(dentry->d_inode), iattr, 0);
 +      return -xfs_setattr_nonsize(XFS_I(dentry->d_inode), iattr, 0);
  }
  
  #define XFS_FIEMAP_FLAGS      (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
@@@ -1022,7 -592,7 +1023,7 @@@ xfs_vn_fiemap
  }
  
  static const struct inode_operations xfs_inode_operations = {
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1048,7 -618,7 +1049,7 @@@ static const struct inode_operations xf
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
        .rename                 = xfs_vn_rename,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1073,7 -643,7 +1074,7 @@@ static const struct inode_operations xf
        .rmdir                  = xfs_vn_unlink,
        .mknod                  = xfs_vn_mknod,
        .rename                 = xfs_vn_rename,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1086,7 -656,7 +1087,7 @@@ static const struct inode_operations xf
        .readlink               = generic_readlink,
        .follow_link            = xfs_vn_follow_link,
        .put_link               = xfs_vn_put_link,
 -      .check_acl              = xfs_check_acl,
 +      .get_acl                = xfs_get_acl,
        .getattr                = xfs_vn_getattr,
        .setattr                = xfs_vn_setattr,
        .setxattr               = generic_setxattr,
@@@ -1194,15 -764,6 +1195,15 @@@ xfs_setup_inode
                break;
        }
  
 +      /*
 +       * If there is no attribute fork no ACL can exist on this inode,
 +       * and it can't have any file capabilities attached to it either.
 +       */
 +      if (!XFS_IFORK_Q(ip)) {
 +              inode_has_no_xattr(inode);
 +              cache_no_acl(inode);
 +      }
 +
        xfs_iflags_clear(ip, XFS_INEW);
        barrier();
  
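A small convention to note in xfs_vn_setattr() above: the XFS core routines introduced here return zero or a positive errno (XFS_ERROR(EROFS), XFS_ERROR(EIO)), while the VFS ->setattr contract is zero or a negative errno, hence the leading minus at the boundary. Schematically (a hypothetical wrapper, not code from this diff):

	/* 0 stays 0; a positive XFS errno becomes the negative VFS one. */
	static int vfs_facing_setattr(struct xfs_inode *ip, struct iattr *iattr)
	{
		int error = xfs_setattr_nonsize(ip, iattr, 0);	/* 0 or +errno */
		return -error;
	}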
diff --combined include/linux/security.h
index ebd2a53a3d073cbf0489848c50725250a3f6872b,6a20c702549534a5d08fea7286625b86e79dd7d0..1c528b19a3291326105c217f2cbeb3dacabc8ac4
@@@ -36,6 -36,7 +36,7 @@@
  #include <linux/key.h>
  #include <linux/xfrm.h>
  #include <linux/slab.h>
+ #include <linux/xattr.h>
  #include <net/flow.h>
  
  /* Maximum number of letters for an LSM name string */
@@@ -147,6 -148,10 +148,10 @@@ extern int mmap_min_addr_handler(struc
                                 void __user *buffer, size_t *lenp, loff_t *ppos);
  #endif
  
+ /* security_inode_init_security callback function to write xattrs */
+ typedef int (*initxattrs) (struct inode *inode,
+                          const struct xattr *xattr_array, void *fs_data);
  #ifdef CONFIG_SECURITY
  
  struct security_mnt_opts {
@@@ -1456,7 -1461,7 +1461,7 @@@ struct security_operations 
                             struct inode *new_dir, struct dentry *new_dentry);
        int (*inode_readlink) (struct dentry *dentry);
        int (*inode_follow_link) (struct dentry *dentry, struct nameidata *nd);
 -      int (*inode_permission) (struct inode *inode, int mask, unsigned flags);
 +      int (*inode_permission) (struct inode *inode, int mask);
        int (*inode_setattr)    (struct dentry *dentry, struct iattr *attr);
        int (*inode_getattr) (struct vfsmount *mnt, struct dentry *dentry);
        int (*inode_setxattr) (struct dentry *dentry, const char *name,
@@@ -1704,8 -1709,11 +1709,11 @@@ int security_sb_parse_opts_str(char *op
  int security_inode_alloc(struct inode *inode);
  void security_inode_free(struct inode *inode);
  int security_inode_init_security(struct inode *inode, struct inode *dir,
-                                const struct qstr *qstr, char **name,
-                                void **value, size_t *len);
+                                const struct qstr *qstr,
+                                initxattrs initxattrs, void *fs_data);
+ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
+                                    const struct qstr *qstr, char **name,
+                                    void **value, size_t *len);
  int security_inode_create(struct inode *dir, struct dentry *dentry, int mode);
  int security_inode_link(struct dentry *old_dentry, struct inode *dir,
                         struct dentry *new_dentry);
@@@ -1720,6 -1728,7 +1728,6 @@@ int security_inode_rename(struct inode 
  int security_inode_readlink(struct dentry *dentry);
  int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd);
  int security_inode_permission(struct inode *inode, int mask);
 -int security_inode_exec_permission(struct inode *inode, unsigned int flags);
  int security_inode_setattr(struct dentry *dentry, struct iattr *attr);
  int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry);
  int security_inode_setxattr(struct dentry *dentry, const char *name,
@@@ -2034,9 -2043,8 +2042,8 @@@ static inline void security_inode_free(
  static inline int security_inode_init_security(struct inode *inode,
                                                struct inode *dir,
                                                const struct qstr *qstr,
-                                               char **name,
-                                               void **value,
-                                               size_t *len)
+                                               initxattrs initxattrs,
+                                               void *fs_data)
  {
        return -EOPNOTSUPP;
  }
@@@ -2112,6 -2120,12 +2119,6 @@@ static inline int security_inode_permis
        return 0;
  }
  
 -static inline int security_inode_exec_permission(struct inode *inode,
 -                                                unsigned int flags)
 -{
 -      return 0;
 -}
 -
  static inline int security_inode_setattr(struct dentry *dentry,
                                          struct iattr *attr)
  {
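Putting the pieces together: security_inode_init_security() now builds a NULL-name-terminated array of xattrs from the LSM and hands it to the filesystem's initxattrs callback, with whatever context the filesystem supplied travelling through the opaque fs_data pointer (a tid_t for jfs, a btrfs_trans_handle for btrfs, NULL for gfs2 and xfs); the older name/value/len interface survives as security_old_inode_init_security(). A minimal callback sketch for a hypothetical filesystem, modelled on the xfs variant above (myfs_setxattr and MYFS_SECURE are assumptions):

	static int myfs_initxattrs(struct inode *inode,
				   const struct xattr *xattr_array, void *fs_data)
	{
		const struct xattr *xattr;
		int err = 0;

		for (xattr = xattr_array; xattr->name != NULL; xattr++) {
			/* xattr->name arrives without the "security." prefix */
			err = myfs_setxattr(inode, xattr->name, xattr->value,
					    xattr->value_len, MYFS_SECURE);
			if (err < 0)
				break;
		}
		return err;
	}

The creation path then becomes a one-liner, as in the filesystems converted above:

	err = security_inode_init_security(inode, dir, &dentry->d_name,
					   &myfs_initxattrs, NULL);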
diff --combined mm/shmem.c
index 32f6763f16fb82ad7068c4d73479a96583f4ef15,01c19c62d6852b82dffd091d77592a1a17990b4d..2d357729529880b29f18704edda807108f6cdc5b
@@@ -6,8 -6,7 +6,8 @@@
   *             2000-2001 Christoph Rohland
   *             2000-2001 SAP AG
   *             2002 Red Hat Inc.
 - * Copyright (C) 2002-2005 Hugh Dickins.
 + * Copyright (C) 2002-2011 Hugh Dickins.
 + * Copyright (C) 2011 Google Inc.
   * Copyright (C) 2002-2005 VERITAS Software Corporation.
   * Copyright (C) 2004 Andi Kleen, SuSE Labs
   *
@@@ -29,6 -28,7 +29,6 @@@
  #include <linux/file.h>
  #include <linux/mm.h>
  #include <linux/module.h>
 -#include <linux/percpu_counter.h>
  #include <linux/swap.h>
  
  static struct vfsmount *shm_mnt;
@@@ -51,9 -51,6 +51,9 @@@
  #include <linux/shmem_fs.h>
  #include <linux/writeback.h>
  #include <linux/blkdev.h>
 +#include <linux/pagevec.h>
 +#include <linux/percpu_counter.h>
 +#include <linux/splice.h>
  #include <linux/security.h>
  #include <linux/swapops.h>
  #include <linux/mempolicy.h>
  #include <linux/magic.h>
  
  #include <asm/uaccess.h>
 -#include <asm/div64.h>
  #include <asm/pgtable.h>
  
 -/*
 - * The maximum size of a shmem/tmpfs file is limited by the maximum size of
 - * its triple-indirect swap vector - see illustration at shmem_swp_entry().
 - *
 - * With 4kB page size, maximum file size is just over 2TB on a 32-bit kernel,
 - * but one eighth of that on a 64-bit kernel.  With 8kB page size, maximum
 - * file size is just over 4TB on a 64-bit kernel, but 16TB on a 32-bit kernel,
 - * MAX_LFS_FILESIZE being then more restrictive than swap vector layout.
 - *
 - * We use / and * instead of shifts in the definitions below, so that the swap
 - * vector can be tested with small even values (e.g. 20) for ENTRIES_PER_PAGE.
 - */
 -#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
 -#define ENTRIES_PER_PAGEPAGE ((unsigned long long)ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
 -
 -#define SHMSWP_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
 -#define SHMSWP_MAX_BYTES (SHMSWP_MAX_INDEX << PAGE_CACHE_SHIFT)
 -
 -#define SHMEM_MAX_BYTES  min_t(unsigned long long, SHMSWP_MAX_BYTES, MAX_LFS_FILESIZE)
 -#define SHMEM_MAX_INDEX  ((unsigned long)((SHMEM_MAX_BYTES+1) >> PAGE_CACHE_SHIFT))
 -
  #define BLOCKS_PER_PAGE  (PAGE_CACHE_SIZE/512)
  #define VM_ACCT(size)    (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
  
 -/* info->flags needs VM_flags to handle pagein/truncate races efficiently */
 -#define SHMEM_PAGEIN   VM_READ
 -#define SHMEM_TRUNCATE         VM_WRITE
 -
 -/* Definition to limit shmem_truncate's steps between cond_rescheds */
 -#define LATENCY_LIMIT  64
 -
  /* Pretend that each entry is of this size in directory's i_size */
  #define BOGO_DIRENT_SIZE 20
  
 +/* Symlink up to this size is kmalloc'ed instead of using a swappable page */
 +#define SHORT_SYMLINK_LEN 128
 +
  struct shmem_xattr {
        struct list_head list;  /* anchored by shmem_inode_info->xattr_list */
        char *name;             /* xattr name */
        char value[0];
  };
  
 -/* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 +/* Flag allocation requirements to shmem_getpage */
  enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
@@@ -103,14 -126,57 +103,14 @@@ static unsigned long shmem_default_max_
  }
  #endif
  
 -static int shmem_getpage(struct inode *inode, unsigned long idx,
 -                       struct page **pagep, enum sgp_type sgp, int *type);
 -
 -static inline struct page *shmem_dir_alloc(gfp_t gfp_mask)
 -{
 -      /*
 -       * The above definition of ENTRIES_PER_PAGE, and the use of
 -       * BLOCKS_PER_PAGE on indirect pages, assume PAGE_CACHE_SIZE:
 -       * might be reconsidered if it ever diverges from PAGE_SIZE.
 -       *
 -       * Mobility flags are masked out as swap vectors cannot move
 -       */
 -      return alloc_pages((gfp_mask & ~GFP_MOVABLE_MASK) | __GFP_ZERO,
 -                              PAGE_CACHE_SHIFT-PAGE_SHIFT);
 -}
 -
 -static inline void shmem_dir_free(struct page *page)
 -{
 -      __free_pages(page, PAGE_CACHE_SHIFT-PAGE_SHIFT);
 -}
 -
 -static struct page **shmem_dir_map(struct page *page)
 -{
 -      return (struct page **)kmap_atomic(page, KM_USER0);
 -}
 -
 -static inline void shmem_dir_unmap(struct page **dir)
 -{
 -      kunmap_atomic(dir, KM_USER0);
 -}
 -
 -static swp_entry_t *shmem_swp_map(struct page *page)
 -{
 -      return (swp_entry_t *)kmap_atomic(page, KM_USER1);
 -}
 -
 -static inline void shmem_swp_balance_unmap(void)
 -{
 -      /*
 -       * When passing a pointer to an i_direct entry, to code which
 -       * also handles indirect entries and so will shmem_swp_unmap,
 -       * we must arrange for the preempt count to remain in balance.
 -       * What kmap_atomic of a lowmem page does depends on config
 -       * and architecture, so pretend to kmap_atomic some lowmem page.
 -       */
 -      (void) kmap_atomic(ZERO_PAGE(0), KM_USER1);
 -}
 +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type);
  
 -static inline void shmem_swp_unmap(swp_entry_t *entry)
 +static inline int shmem_getpage(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, int *fault_type)
  {
 -      kunmap_atomic(entry, KM_USER1);
 +      return shmem_getpage_gfp(inode, index, pagep, sgp,
 +                      mapping_gfp_mask(inode->i_mapping), fault_type);
  }
  
  static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
@@@ -170,6 -236,17 +170,6 @@@ static struct backing_dev_info shmem_ba
  static LIST_HEAD(shmem_swaplist);
  static DEFINE_MUTEX(shmem_swaplist_mutex);
  
 -static void shmem_free_blocks(struct inode *inode, long pages)
 -{
 -      struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 -      if (sbinfo->max_blocks) {
 -              percpu_counter_add(&sbinfo->used_blocks, -pages);
 -              spin_lock(&inode->i_lock);
 -              inode->i_blocks -= pages*BLOCKS_PER_PAGE;
 -              spin_unlock(&inode->i_lock);
 -      }
 -}
 -
  static int shmem_reserve_inode(struct super_block *sb)
  {
        struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
@@@ -196,7 -273,7 +196,7 @@@ static void shmem_free_inode(struct sup
  }
  
  /**
 - * shmem_recalc_inode - recalculate the size of an inode
 + * shmem_recalc_inode - recalculate the block usage of an inode
   * @inode: inode to recalc
   *
   * We have to calculate the free blocks since the mm can drop
@@@ -214,297 -291,474 +214,297 @@@ static void shmem_recalc_inode(struct i
  
        freed = info->alloced - info->swapped - inode->i_mapping->nrpages;
        if (freed > 0) {
 +              struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 +              if (sbinfo->max_blocks)
 +                      percpu_counter_add(&sbinfo->used_blocks, -freed);
                info->alloced -= freed;
 +              inode->i_blocks -= freed * BLOCKS_PER_PAGE;
                shmem_unacct_blocks(info->flags, freed);
 -              shmem_free_blocks(inode, freed);
        }
  }
  
 -/**
 - * shmem_swp_entry - find the swap vector position in the info structure
 - * @info:  info structure for the inode
 - * @index: index of the page to find
 - * @page:  optional page to add to the structure. Has to be preset to
 - *         all zeros
 - *
 - * If there is no space allocated yet it will return NULL when
 - * page is NULL, else it will use the page for the needed block,
 - * setting it to NULL on return to indicate that it has been used.
 - *
 - * The swap vector is organized the following way:
 - *
 - * There are SHMEM_NR_DIRECT entries directly stored in the
 - * shmem_inode_info structure. So small files do not need an addional
 - * allocation.
 - *
 - * For pages with index > SHMEM_NR_DIRECT there is the pointer
 - * i_indirect which points to a page which holds in the first half
 - * doubly indirect blocks, in the second half triple indirect blocks:
 - *
 - * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
 - * following layout (for SHMEM_NR_DIRECT == 16):
 - *
 - * i_indirect -> dir --> 16-19
 - *          |      +-> 20-23
 - *          |
 - *          +-->dir2 --> 24-27
 - *          |        +-> 28-31
 - *          |        +-> 32-35
 - *          |        +-> 36-39
 - *          |
 - *          +-->dir3 --> 40-43
 - *                   +-> 44-47
 - *                   +-> 48-51
 - *                   +-> 52-55
 +/*
 + * Replace item expected in radix tree by a new item, while holding tree lock.
   */
 -static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, struct page **page)
 -{
 -      unsigned long offset;
 -      struct page **dir;
 -      struct page *subdir;
 +static int shmem_radix_tree_replace(struct address_space *mapping,
 +                      pgoff_t index, void *expected, void *replacement)
 +{
 +      void **pslot;
 +      void *item = NULL;
 +
 +      VM_BUG_ON(!expected);
 +      pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
 +      if (pslot)
 +              item = radix_tree_deref_slot_protected(pslot,
 +                                                      &mapping->tree_lock);
 +      if (item != expected)
 +              return -ENOENT;
 +      if (replacement)
 +              radix_tree_replace_slot(pslot, replacement);
 +      else
 +              radix_tree_delete(&mapping->page_tree, index);
 +      return 0;
 +}
  
 -      if (index < SHMEM_NR_DIRECT) {
 -              shmem_swp_balance_unmap();
 -              return info->i_direct+index;
 -      }
 -      if (!info->i_indirect) {
 -              if (page) {
 -                      info->i_indirect = *page;
 -                      *page = NULL;
 -              }
 -              return NULL;                    /* need another page */
 -      }
 +/*
 + * Like add_to_page_cache_locked, but error if expected item has gone.
 + */
 +static int shmem_add_to_page_cache(struct page *page,
 +                                 struct address_space *mapping,
 +                                 pgoff_t index, gfp_t gfp, void *expected)
 +{
 +      int error = 0;
  
 -      index -= SHMEM_NR_DIRECT;
 -      offset = index % ENTRIES_PER_PAGE;
 -      index /= ENTRIES_PER_PAGE;
 -      dir = shmem_dir_map(info->i_indirect);
 -
 -      if (index >= ENTRIES_PER_PAGE/2) {
 -              index -= ENTRIES_PER_PAGE/2;
 -              dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
 -              index %= ENTRIES_PER_PAGE;
 -              subdir = *dir;
 -              if (!subdir) {
 -                      if (page) {
 -                              *dir = *page;
 -                              *page = NULL;
 -                      }
 -                      shmem_dir_unmap(dir);
 -                      return NULL;            /* need another page */
 -              }
 -              shmem_dir_unmap(dir);
 -              dir = shmem_dir_map(subdir);
 -      }
 +      VM_BUG_ON(!PageLocked(page));
 +      VM_BUG_ON(!PageSwapBacked(page));
  
 -      dir += index;
 -      subdir = *dir;
 -      if (!subdir) {
 -              if (!page || !(subdir = *page)) {
 -                      shmem_dir_unmap(dir);
 -                      return NULL;            /* need a page */
 +      if (!expected)
 +              error = radix_tree_preload(gfp & GFP_RECLAIM_MASK);
 +      if (!error) {
 +              page_cache_get(page);
 +              page->mapping = mapping;
 +              page->index = index;
 +
 +              spin_lock_irq(&mapping->tree_lock);
 +              if (!expected)
 +                      error = radix_tree_insert(&mapping->page_tree,
 +                                                      index, page);
 +              else
 +                      error = shmem_radix_tree_replace(mapping, index,
 +                                                      expected, page);
 +              if (!error) {
 +                      mapping->nrpages++;
 +                      __inc_zone_page_state(page, NR_FILE_PAGES);
 +                      __inc_zone_page_state(page, NR_SHMEM);
 +                      spin_unlock_irq(&mapping->tree_lock);
 +              } else {
 +                      page->mapping = NULL;
 +                      spin_unlock_irq(&mapping->tree_lock);
 +                      page_cache_release(page);
                }
 -              *dir = subdir;
 -              *page = NULL;
 +              if (!expected)
 +                      radix_tree_preload_end();
        }
 -      shmem_dir_unmap(dir);
 -      return shmem_swp_map(subdir) + offset;
 +      if (error)
 +              mem_cgroup_uncharge_cache_page(page);
 +      return error;
  }
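
[Editor's note: the preload asymmetry above is deliberate. A fresh insert may need new radix-tree nodes, so it preloads; replacing a swap entry reuses the slot that entry already occupies, so no preload is needed (the same point is made in shmem_unuse() below). Sketch of the two call patterns, error handling elided; both appear later in this patch.]

    /* 1. brand new page: no prior slot, preload happens inside */
    error = shmem_add_to_page_cache(page, mapping, index, gfp, NULL);

    /* 2. page brought back from swap: its swap entry already holds
     *    the slot, so pass it as 'expected'; works under GFP_NOWAIT */
    error = shmem_add_to_page_cache(page, mapping, index, GFP_NOWAIT,
                                    swp_to_radix_entry(swap));
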
  
 -static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, unsigned long value)
 +/*
 + * Like delete_from_page_cache, but substitutes swap for page.
 + */
 +static void shmem_delete_from_page_cache(struct page *page, void *radswap)
  {
 -      long incdec = value? 1: -1;
 +      struct address_space *mapping = page->mapping;
 +      int error;
  
 -      entry->val = value;
 -      info->swapped += incdec;
 -      if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
 -              struct page *page = kmap_atomic_to_page(entry);
 -              set_page_private(page, page_private(page) + incdec);
 -      }
 +      spin_lock_irq(&mapping->tree_lock);
 +      error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
 +      page->mapping = NULL;
 +      mapping->nrpages--;
 +      __dec_zone_page_state(page, NR_FILE_PAGES);
 +      __dec_zone_page_state(page, NR_SHMEM);
 +      spin_unlock_irq(&mapping->tree_lock);
 +      page_cache_release(page);
 +      BUG_ON(error);
  }
  
 -/**
 - * shmem_swp_alloc - get the position of the swap entry for the page.
 - * @info:     info structure for the inode
 - * @index:    index of the page to find
 - * @sgp:      check and recheck i_size? skip allocation?
 - *
 - * If the entry does not exist, allocate it.
 +/*
 + * Like find_get_pages, but collecting swap entries as well as pages.
   */
 -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
 -{
 -      struct inode *inode = &info->vfs_inode;
 -      struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
 -      struct page *page = NULL;
 -      swp_entry_t *entry;
 -
 -      if (sgp != SGP_WRITE &&
 -          ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              return ERR_PTR(-EINVAL);
 -
 -      while (!(entry = shmem_swp_entry(info, index, &page))) {
 -              if (sgp == SGP_READ)
 -                      return shmem_swp_map(ZERO_PAGE(0));
 -              /*
  -               * Test used_blocks against 1 less than max_blocks, since we
  -               * have 1 data page (and perhaps indirect index pages) yet to
  -               * allocate: a waste to allocate index pages if we cannot
  -               * allocate data.
 -               */
 -              if (sbinfo->max_blocks) {
 -                      if (percpu_counter_compare(&sbinfo->used_blocks,
 -                                              sbinfo->max_blocks - 1) >= 0)
 -                              return ERR_PTR(-ENOSPC);
 -                      percpu_counter_inc(&sbinfo->used_blocks);
 -                      spin_lock(&inode->i_lock);
 -                      inode->i_blocks += BLOCKS_PER_PAGE;
 -                      spin_unlock(&inode->i_lock);
 +static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
 +                                      pgoff_t start, unsigned int nr_pages,
 +                                      struct page **pages, pgoff_t *indices)
 +{
 +      unsigned int i;
 +      unsigned int ret;
 +      unsigned int nr_found;
 +
 +      rcu_read_lock();
 +restart:
 +      nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree,
 +                              (void ***)pages, indices, start, nr_pages);
 +      ret = 0;
 +      for (i = 0; i < nr_found; i++) {
 +              struct page *page;
 +repeat:
 +              page = radix_tree_deref_slot((void **)pages[i]);
 +              if (unlikely(!page))
 +                      continue;
 +              if (radix_tree_exception(page)) {
 +                      if (radix_tree_deref_retry(page))
 +                              goto restart;
 +                      /*
 +                       * Otherwise, we must be storing a swap entry
 +                       * here as an exceptional entry: so return it
 +                       * without attempting to raise page count.
 +                       */
 +                      goto export;
                }
 +              if (!page_cache_get_speculative(page))
 +                      goto repeat;
  
 -              spin_unlock(&info->lock);
 -              page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping));
 -              spin_lock(&info->lock);
 -
 -              if (!page) {
 -                      shmem_free_blocks(inode, 1);
 -                      return ERR_PTR(-ENOMEM);
 -              }
 -              if (sgp != SGP_WRITE &&
 -                  ((loff_t) index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 -                      entry = ERR_PTR(-EINVAL);
 -                      break;
 +              /* Has the page moved? */
 +              if (unlikely(page != *((void **)pages[i]))) {
 +                      page_cache_release(page);
 +                      goto repeat;
                }
 -              if (info->next_index <= index)
 -                      info->next_index = index + 1;
 -      }
 -      if (page) {
 -              /* another task gave its page, or truncated the file */
 -              shmem_free_blocks(inode, 1);
 -              shmem_dir_free(page);
 -      }
 -      if (info->next_index <= index && !IS_ERR(entry))
 -              info->next_index = index + 1;
 -      return entry;
 +export:
 +              indices[ret] = indices[i];
 +              pages[ret] = page;
 +              ret++;
 +      }
 +      if (unlikely(!ret && nr_found))
 +              goto restart;
 +      rcu_read_unlock();
 +      return ret;
  }
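
[Editor's note: the restart/repeat labels above follow the usual lockless gang-lookup protocol: radix_tree_deref_retry() means the tree was being reshaped under RCU, a failed page_cache_get_speculative() or a slot that changed after the get means the page was repurposed mid-lookup, and exceptional entries are exported as-is since a swap entry has no refcount to take. Callers then split the mixed result array, e.g. this sketch, where handle_page()/handle_swap() are hypothetical:]

    for (i = 0; i < nr; i++) {
            if (radix_tree_exceptional_entry(pages[i]))
                    handle_swap(indices[i], pages[i]);  /* hypothetical */
            else
                    handle_page(indices[i], pages[i]);  /* hypothetical */
    }
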
  
 -/**
 - * shmem_free_swp - free some swap entries in a directory
 - * @dir:        pointer to the directory
 - * @edir:       pointer after last entry of the directory
 - * @punch_lock: pointer to spinlock when needed for the holepunch case
 +/*
 + * Remove swap entry from radix tree, free the swap and its page cache.
   */
 -static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir,
 -                                              spinlock_t *punch_lock)
 -{
 -      spinlock_t *punch_unlock = NULL;
 -      swp_entry_t *ptr;
 -      int freed = 0;
 -
 -      for (ptr = dir; ptr < edir; ptr++) {
 -              if (ptr->val) {
 -                      if (unlikely(punch_lock)) {
 -                              punch_unlock = punch_lock;
 -                              punch_lock = NULL;
 -                              spin_lock(punch_unlock);
 -                              if (!ptr->val)
 -                                      continue;
 -                      }
 -                      free_swap_and_cache(*ptr);
 -                      *ptr = (swp_entry_t){0};
 -                      freed++;
 -              }
 -      }
 -      if (punch_unlock)
 -              spin_unlock(punch_unlock);
 -      return freed;
 -}
 -
 -static int shmem_map_and_free_swp(struct page *subdir, int offset,
 -              int limit, struct page ***dir, spinlock_t *punch_lock)
 -{
 -      swp_entry_t *ptr;
 -      int freed = 0;
 -
 -      ptr = shmem_swp_map(subdir);
 -      for (; offset < limit; offset += LATENCY_LIMIT) {
 -              int size = limit - offset;
 -              if (size > LATENCY_LIMIT)
 -                      size = LATENCY_LIMIT;
 -              freed += shmem_free_swp(ptr+offset, ptr+offset+size,
 -                                                      punch_lock);
 -              if (need_resched()) {
 -                      shmem_swp_unmap(ptr);
 -                      if (*dir) {
 -                              shmem_dir_unmap(*dir);
 -                              *dir = NULL;
 -                      }
 -                      cond_resched();
 -                      ptr = shmem_swp_map(subdir);
 -              }
 -      }
 -      shmem_swp_unmap(ptr);
 -      return freed;
 +static int shmem_free_swap(struct address_space *mapping,
 +                         pgoff_t index, void *radswap)
 +{
 +      int error;
 +
 +      spin_lock_irq(&mapping->tree_lock);
 +      error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
 +      spin_unlock_irq(&mapping->tree_lock);
 +      if (!error)
 +              free_swap_and_cache(radix_to_swp_entry(radswap));
 +      return error;
  }
  
 -static void shmem_free_pages(struct list_head *next)
 +/*
 + * Pagevec may contain swap entries, so shuffle up pages before releasing.
 + */
 +static void shmem_pagevec_release(struct pagevec *pvec)
  {
 -      struct page *page;
 -      int freed = 0;
 -
 -      do {
 -              page = container_of(next, struct page, lru);
 -              next = next->next;
 -              shmem_dir_free(page);
 -              freed++;
 -              if (freed >= LATENCY_LIMIT) {
 -                      cond_resched();
 -                      freed = 0;
 -              }
 -      } while (next);
 +      int i, j;
 +
 +      for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
 +              struct page *page = pvec->pages[i];
 +              if (!radix_tree_exceptional_entry(page))
 +                      pvec->pages[j++] = page;
 +      }
 +      pvec->nr = j;
 +      pagevec_release(pvec);
  }
  
 -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 +/*
 + * Remove range of pages and swap entries from radix tree, and free them.
 + */
 +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  {
 +      struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
 -      unsigned long idx;
 -      unsigned long size;
 -      unsigned long limit;
 -      unsigned long stage;
 -      unsigned long diroff;
 -      struct page **dir;
 -      struct page *topdir;
 -      struct page *middir;
 -      struct page *subdir;
 -      swp_entry_t *ptr;
 -      LIST_HEAD(pages_to_free);
 -      long nr_pages_to_free = 0;
 +      pgoff_t start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 +      unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
 +      pgoff_t end = (lend >> PAGE_CACHE_SHIFT);
 +      struct pagevec pvec;
 +      pgoff_t indices[PAGEVEC_SIZE];
        long nr_swaps_freed = 0;
 -      int offset;
 -      int freed;
 -      int punch_hole;
 -      spinlock_t *needs_lock;
 -      spinlock_t *punch_lock;
 -      unsigned long upper_limit;
 +      pgoff_t index;
 +      int i;
  
 -      truncate_inode_pages_range(inode->i_mapping, start, end);
 +      BUG_ON((lend & (PAGE_CACHE_SIZE - 1)) != (PAGE_CACHE_SIZE - 1));
  
 -      inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 -      idx = (start + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 -      if (idx >= info->next_index)
 -              return;
 +      pagevec_init(&pvec, 0);
 +      index = start;
 +      while (index <= end) {
 +              pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
 +                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 +                                                      pvec.pages, indices);
 +              if (!pvec.nr)
 +                      break;
 +              mem_cgroup_uncharge_start();
 +              for (i = 0; i < pagevec_count(&pvec); i++) {
 +                      struct page *page = pvec.pages[i];
  
 -      spin_lock(&info->lock);
 -      info->flags |= SHMEM_TRUNCATE;
 -      if (likely(end == (loff_t) -1)) {
 -              limit = info->next_index;
 -              upper_limit = SHMEM_MAX_INDEX;
 -              info->next_index = idx;
 -              needs_lock = NULL;
 -              punch_hole = 0;
 -      } else {
 -              if (end + 1 >= inode->i_size) { /* we may free a little more */
 -                      limit = (inode->i_size + PAGE_CACHE_SIZE - 1) >>
 -                                                      PAGE_CACHE_SHIFT;
 -                      upper_limit = SHMEM_MAX_INDEX;
 -              } else {
 -                      limit = (end + 1) >> PAGE_CACHE_SHIFT;
 -                      upper_limit = limit;
 -              }
 -              needs_lock = &info->lock;
 -              punch_hole = 1;
 -      }
 +                      index = indices[i];
 +                      if (index > end)
 +                              break;
  
 -      topdir = info->i_indirect;
 -      if (topdir && idx <= SHMEM_NR_DIRECT && !punch_hole) {
 -              info->i_indirect = NULL;
 -              nr_pages_to_free++;
 -              list_add(&topdir->lru, &pages_to_free);
 +                      if (radix_tree_exceptional_entry(page)) {
 +                              nr_swaps_freed += !shmem_free_swap(mapping,
 +                                                              index, page);
 +                              continue;
 +                      }
 +
 +                      if (!trylock_page(page))
 +                              continue;
 +                      if (page->mapping == mapping) {
 +                              VM_BUG_ON(PageWriteback(page));
 +                              truncate_inode_page(mapping, page);
 +                      }
 +                      unlock_page(page);
 +              }
 +              shmem_pagevec_release(&pvec);
 +              mem_cgroup_uncharge_end();
 +              cond_resched();
 +              index++;
        }
 -      spin_unlock(&info->lock);
  
 -      if (info->swapped && idx < SHMEM_NR_DIRECT) {
 -              ptr = info->i_direct;
 -              size = limit;
 -              if (size > SHMEM_NR_DIRECT)
 -                      size = SHMEM_NR_DIRECT;
 -              nr_swaps_freed = shmem_free_swp(ptr+idx, ptr+size, needs_lock);
 +      if (partial) {
 +              struct page *page = NULL;
 +              shmem_getpage(inode, start - 1, &page, SGP_READ, NULL);
 +              if (page) {
 +                      zero_user_segment(page, partial, PAGE_CACHE_SIZE);
 +                      set_page_dirty(page);
 +                      unlock_page(page);
 +                      page_cache_release(page);
 +              }
        }
  
 -      /*
 -       * If there are no indirect blocks or we are punching a hole
 -       * below indirect blocks, nothing to be done.
 -       */
 -      if (!topdir || limit <= SHMEM_NR_DIRECT)
 -              goto done2;
 +      index = start;
 +      for ( ; ; ) {
 +              cond_resched();
 +              pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
 +                      min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 +                                                      pvec.pages, indices);
 +              if (!pvec.nr) {
 +                      if (index == start)
 +                              break;
 +                      index = start;
 +                      continue;
 +              }
 +              if (index == start && indices[0] > end) {
 +                      shmem_pagevec_release(&pvec);
 +                      break;
 +              }
 +              mem_cgroup_uncharge_start();
 +              for (i = 0; i < pagevec_count(&pvec); i++) {
 +                      struct page *page = pvec.pages[i];
  
 -      /*
 -       * The truncation case has already dropped info->lock, and we're safe
 -       * because i_size and next_index have already been lowered, preventing
 -       * access beyond.  But in the punch_hole case, we still need to take
 -       * the lock when updating the swap directory, because there might be
 -       * racing accesses by shmem_getpage(SGP_CACHE), shmem_unuse_inode or
 -       * shmem_writepage.  However, whenever we find we can remove a whole
 -       * directory page (not at the misaligned start or end of the range),
 -       * we first NULLify its pointer in the level above, and then have no
 -       * need to take the lock when updating its contents: needs_lock and
 -       * punch_lock (either pointing to info->lock or NULL) manage this.
 -       */
 +                      index = indices[i];
 +                      if (index > end)
 +                              break;
  
 -      upper_limit -= SHMEM_NR_DIRECT;
 -      limit -= SHMEM_NR_DIRECT;
 -      idx = (idx > SHMEM_NR_DIRECT)? (idx - SHMEM_NR_DIRECT): 0;
 -      offset = idx % ENTRIES_PER_PAGE;
 -      idx -= offset;
 -
 -      dir = shmem_dir_map(topdir);
 -      stage = ENTRIES_PER_PAGEPAGE/2;
 -      if (idx < ENTRIES_PER_PAGEPAGE/2) {
 -              middir = topdir;
 -              diroff = idx/ENTRIES_PER_PAGE;
 -      } else {
 -              dir += ENTRIES_PER_PAGE/2;
 -              dir += (idx - ENTRIES_PER_PAGEPAGE/2)/ENTRIES_PER_PAGEPAGE;
 -              while (stage <= idx)
 -                      stage += ENTRIES_PER_PAGEPAGE;
 -              middir = *dir;
 -              if (*dir) {
 -                      diroff = ((idx - ENTRIES_PER_PAGEPAGE/2) %
 -                              ENTRIES_PER_PAGEPAGE) / ENTRIES_PER_PAGE;
 -                      if (!diroff && !offset && upper_limit >= stage) {
 -                              if (needs_lock) {
 -                                      spin_lock(needs_lock);
 -                                      *dir = NULL;
 -                                      spin_unlock(needs_lock);
 -                                      needs_lock = NULL;
 -                              } else
 -                                      *dir = NULL;
 -                              nr_pages_to_free++;
 -                              list_add(&middir->lru, &pages_to_free);
 +                      if (radix_tree_exceptional_entry(page)) {
 +                              nr_swaps_freed += !shmem_free_swap(mapping,
 +                                                              index, page);
 +                              continue;
                        }
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(middir);
 -              } else {
 -                      diroff = 0;
 -                      offset = 0;
 -                      idx = stage;
 -              }
 -      }
  
 -      for (; idx < limit; idx += ENTRIES_PER_PAGE, diroff++) {
 -              if (unlikely(idx == stage)) {
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(topdir) +
 -                          ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 -                      while (!*dir) {
 -                              dir++;
 -                              idx += ENTRIES_PER_PAGEPAGE;
 -                              if (idx >= limit)
 -                                      goto done1;
 +                      lock_page(page);
 +                      if (page->mapping == mapping) {
 +                              VM_BUG_ON(PageWriteback(page));
 +                              truncate_inode_page(mapping, page);
                        }
 -                      stage = idx + ENTRIES_PER_PAGEPAGE;
 -                      middir = *dir;
 -                      if (punch_hole)
 -                              needs_lock = &info->lock;
 -                      if (upper_limit >= stage) {
 -                              if (needs_lock) {
 -                                      spin_lock(needs_lock);
 -                                      *dir = NULL;
 -                                      spin_unlock(needs_lock);
 -                                      needs_lock = NULL;
 -                              } else
 -                                      *dir = NULL;
 -                              nr_pages_to_free++;
 -                              list_add(&middir->lru, &pages_to_free);
 -                      }
 -                      shmem_dir_unmap(dir);
 -                      cond_resched();
 -                      dir = shmem_dir_map(middir);
 -                      diroff = 0;
 -              }
 -              punch_lock = needs_lock;
 -              subdir = dir[diroff];
 -              if (subdir && !offset && upper_limit-idx >= ENTRIES_PER_PAGE) {
 -                      if (needs_lock) {
 -                              spin_lock(needs_lock);
 -                              dir[diroff] = NULL;
 -                              spin_unlock(needs_lock);
 -                              punch_lock = NULL;
 -                      } else
 -                              dir[diroff] = NULL;
 -                      nr_pages_to_free++;
 -                      list_add(&subdir->lru, &pages_to_free);
 -              }
 -              if (subdir && page_private(subdir) /* has swap entries */) {
 -                      size = limit - idx;
 -                      if (size > ENTRIES_PER_PAGE)
 -                              size = ENTRIES_PER_PAGE;
 -                      freed = shmem_map_and_free_swp(subdir,
 -                                      offset, size, &dir, punch_lock);
 -                      if (!dir)
 -                              dir = shmem_dir_map(middir);
 -                      nr_swaps_freed += freed;
 -                      if (offset || punch_lock) {
 -                              spin_lock(&info->lock);
 -                              set_page_private(subdir,
 -                                      page_private(subdir) - freed);
 -                              spin_unlock(&info->lock);
 -                      } else
 -                              BUG_ON(page_private(subdir) != freed);
 +                      unlock_page(page);
                }
 -              offset = 0;
 -      }
 -done1:
 -      shmem_dir_unmap(dir);
 -done2:
 -      if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
 -              /*
 -               * Call truncate_inode_pages again: racing shmem_unuse_inode
 -               * may have swizzled a page in from swap since
 -               * truncate_pagecache or generic_delete_inode did it, before we
 -               * lowered next_index.  Also, though shmem_getpage checks
 -               * i_size before adding to cache, no recheck after: so fix the
 -               * narrow window there too.
 -               */
 -              truncate_inode_pages_range(inode->i_mapping, start, end);
 +              shmem_pagevec_release(&pvec);
 +              mem_cgroup_uncharge_end();
 +              index++;
        }
  
        spin_lock(&info->lock);
 -      info->flags &= ~SHMEM_TRUNCATE;
        info->swapped -= nr_swaps_freed;
 -      if (nr_pages_to_free)
 -              shmem_free_blocks(inode, nr_pages_to_free);
        shmem_recalc_inode(inode);
        spin_unlock(&info->lock);
  
 -      /*
 -       * Empty swap vector directory pages to be freed?
 -       */
 -      if (!list_empty(&pages_to_free)) {
 -              pages_to_free.prev->next = NULL;
 -              shmem_free_pages(pages_to_free.next);
 -      }
 +      inode->i_ctime = inode->i_mtime = CURRENT_TIME;
  }
  EXPORT_SYMBOL_GPL(shmem_truncate_range);
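
[Editor's note: a worked example of the index arithmetic at the top of shmem_truncate_range(), assuming a 4096-byte PAGE_CACHE_SIZE: truncating a file to 5000 bytes arrives here as lstart = 5000, lend = -1.]

    pgoff_t start    = (5000 + 4096 - 1) >> 12; /* == 2            */
    unsigned partial = 5000 & (4096 - 1);       /* == 904          */
    pgoff_t end      = (loff_t)-1 >> 12;        /* "all the rest"  */
    /* pages 2..end are removed outright; page 1 survives, and the
     * partial branch zeroes it from byte 904 to the end of page.  */
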
  
@@@ -520,7 -774,37 +520,7 @@@ static int shmem_setattr(struct dentry 
        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
                loff_t oldsize = inode->i_size;
                loff_t newsize = attr->ia_size;
 -              struct page *page = NULL;
  
 -              if (newsize < oldsize) {
 -                      /*
 -                       * If truncating down to a partial page, then
 -                       * if that page is already allocated, hold it
 -                       * in memory until the truncation is over, so
 -                       * truncate_partial_page cannot miss it were
 -                       * it assigned to swap.
 -                       */
 -                      if (newsize & (PAGE_CACHE_SIZE-1)) {
 -                              (void) shmem_getpage(inode,
 -                                      newsize >> PAGE_CACHE_SHIFT,
 -                                              &page, SGP_READ, NULL);
 -                              if (page)
 -                                      unlock_page(page);
 -                      }
 -                      /*
 -                       * Reset SHMEM_PAGEIN flag so that shmem_truncate can
 -                       * detect if any pages might have been added to cache
 -                       * after truncate_inode_pages.  But we needn't bother
 -                       * if it's being fully truncated to zero-length: the
 -                       * nrpages check is efficient enough in that case.
 -                       */
 -                      if (newsize) {
 -                              struct shmem_inode_info *info = SHMEM_I(inode);
 -                              spin_lock(&info->lock);
 -                              info->flags &= ~SHMEM_PAGEIN;
 -                              spin_unlock(&info->lock);
 -                      }
 -              }
                if (newsize != oldsize) {
                        i_size_write(inode, newsize);
                        inode->i_ctime = inode->i_mtime = CURRENT_TIME;
                        /* unmap again to remove racily COWed private pages */
                        unmap_mapping_range(inode->i_mapping, holebegin, 0, 1);
                }
 -              if (page)
 -                      page_cache_release(page);
        }
  
        setattr_copy(inode, attr);
@@@ -556,8 -842,7 +556,8 @@@ static void shmem_evict_inode(struct in
                        list_del_init(&info->swaplist);
                        mutex_unlock(&shmem_swaplist_mutex);
                }
 -      }
 +      } else
 +              kfree(info->symlink);
  
        list_for_each_entry_safe(xattr, nxattr, &info->xattr_list, list) {
                kfree(xattr->name);
        end_writeback(inode);
  }
  
 -static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
 -{
 -      swp_entry_t *ptr;
 -
 -      for (ptr = dir; ptr < edir; ptr++) {
 -              if (ptr->val == entry.val)
 -                      return ptr - dir;
 -      }
 -      return -1;
 -}
 -
 -static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
 +/*
 + * If swap found in inode, free it and move page from swapcache to filecache.
 + */
 +static int shmem_unuse_inode(struct shmem_inode_info *info,
 +                           swp_entry_t swap, struct page *page)
  {
 -      struct address_space *mapping;
 -      unsigned long idx;
 -      unsigned long size;
 -      unsigned long limit;
 -      unsigned long stage;
 -      struct page **dir;
 -      struct page *subdir;
 -      swp_entry_t *ptr;
 -      int offset;
 +      struct address_space *mapping = info->vfs_inode.i_mapping;
 +      void *radswap;
 +      pgoff_t index;
        int error;
  
 -      idx = 0;
 -      ptr = info->i_direct;
 -      spin_lock(&info->lock);
 -      if (!info->swapped) {
 -              list_del_init(&info->swaplist);
 -              goto lost2;
 -      }
 -      limit = info->next_index;
 -      size = limit;
 -      if (size > SHMEM_NR_DIRECT)
 -              size = SHMEM_NR_DIRECT;
 -      offset = shmem_find_swp(entry, ptr, ptr+size);
 -      if (offset >= 0) {
 -              shmem_swp_balance_unmap();
 -              goto found;
 -      }
 -      if (!info->i_indirect)
 -              goto lost2;
 -
 -      dir = shmem_dir_map(info->i_indirect);
 -      stage = SHMEM_NR_DIRECT + ENTRIES_PER_PAGEPAGE/2;
 -
 -      for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
 -              if (unlikely(idx == stage)) {
 -                      shmem_dir_unmap(dir-1);
 -                      if (cond_resched_lock(&info->lock)) {
 -                              /* check it has not been truncated */
 -                              if (limit > info->next_index) {
 -                                      limit = info->next_index;
 -                                      if (idx >= limit)
 -                                              goto lost2;
 -                              }
 -                      }
 -                      dir = shmem_dir_map(info->i_indirect) +
 -                          ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
 -                      while (!*dir) {
 -                              dir++;
 -                              idx += ENTRIES_PER_PAGEPAGE;
 -                              if (idx >= limit)
 -                                      goto lost1;
 -                      }
 -                      stage = idx + ENTRIES_PER_PAGEPAGE;
 -                      subdir = *dir;
 -                      shmem_dir_unmap(dir);
 -                      dir = shmem_dir_map(subdir);
 -              }
 -              subdir = *dir;
 -              if (subdir && page_private(subdir)) {
 -                      ptr = shmem_swp_map(subdir);
 -                      size = limit - idx;
 -                      if (size > ENTRIES_PER_PAGE)
 -                              size = ENTRIES_PER_PAGE;
 -                      offset = shmem_find_swp(entry, ptr, ptr+size);
 -                      shmem_swp_unmap(ptr);
 -                      if (offset >= 0) {
 -                              shmem_dir_unmap(dir);
 -                              ptr = shmem_swp_map(subdir);
 -                              goto found;
 -                      }
 -              }
 -      }
 -lost1:
 -      shmem_dir_unmap(dir-1);
 -lost2:
 -      spin_unlock(&info->lock);
 -      return 0;
 -found:
 -      idx += offset;
 -      ptr += offset;
 +      radswap = swp_to_radix_entry(swap);
 +      index = radix_tree_locate_item(&mapping->page_tree, radswap);
 +      if (index == -1)
 +              return 0;
  
        /*
         * Move _head_ to start search for next from here.
         * But be careful: shmem_evict_inode checks list_empty without taking
         * mutex, and there's an instant in list_move_tail when info->swaplist
 -       * would appear empty, if it were the only one on shmem_swaplist.  We
 -       * could avoid doing it if inode NULL; or use this minor optimization.
 +       * would appear empty, if it were the only one on shmem_swaplist.
         */
        if (shmem_swaplist.next != &info->swaplist)
                list_move_tail(&shmem_swaplist, &info->swaplist);
         * but also to hold up shmem_evict_inode(): so inode cannot be freed
         * beneath us (pagelock doesn't help until the page is in pagecache).
         */
 -      mapping = info->vfs_inode.i_mapping;
 -      error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
 +      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              GFP_NOWAIT, radswap);
        /* which does mem_cgroup_uncharge_cache_page on error */
  
 -      if (error == -EEXIST) {
 -              struct page *filepage = find_get_page(mapping, idx);
 -              error = 1;
 -              if (filepage) {
 -                      /*
 -                       * There might be a more uptodate page coming down
 -                       * from a stacked writepage: forget our swappage if so.
 -                       */
 -                      if (PageUptodate(filepage))
 -                              error = 0;
 -                      page_cache_release(filepage);
 -              }
 -      }
 -      if (!error) {
 +      if (error != -ENOMEM) {
 +              /*
 +               * Truncation and eviction use free_swap_and_cache(), which
  +               * only does a trylock on the page: if we raced, best clean up here.
 +               */
                delete_from_swap_cache(page);
                set_page_dirty(page);
 -              info->flags |= SHMEM_PAGEIN;
 -              shmem_swp_set(info, ptr, 0);
 -              swap_free(entry);
 +              if (!error) {
 +                      spin_lock(&info->lock);
 +                      info->swapped--;
 +                      spin_unlock(&info->lock);
 +                      swap_free(swap);
 +              }
                error = 1;      /* not an error, but entry was found */
        }
 -      shmem_swp_unmap(ptr);
 -      spin_unlock(&info->lock);
        return error;
  }
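
[Editor's note: the return convention above, grounded in the code ("not an error, but entry was found"):]

    /*
     *   0       - swap not in this inode: keep scanning the swaplist
     *   1       - page reinstated (or race cleaned up): stop scanning
     *   -ENOMEM - hard failure, propagated out of shmem_unuse()
     */
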
  
  /*
 - * shmem_unuse() search for an eventually swapped out shmem page.
 + * Search through swapped inodes to find and replace swap by page.
   */
 -int shmem_unuse(swp_entry_t entry, struct page *page)
 +int shmem_unuse(swp_entry_t swap, struct page *page)
  {
 -      struct list_head *p, *next;
 +      struct list_head *this, *next;
        struct shmem_inode_info *info;
        int found = 0;
        int error;
         * Charge page using GFP_KERNEL while we can wait, before taking
         * the shmem_swaplist_mutex which might hold up shmem_writepage().
         * Charged back to the user (not to caller) when swap account is used.
 -       * add_to_page_cache() will be called with GFP_NOWAIT.
         */
        error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
        if (error)
                goto out;
 -      /*
 -       * Try to preload while we can wait, to not make a habit of
 -       * draining atomic reserves; but don't latch on to this cpu,
 -       * it's okay if sometimes we get rescheduled after this.
 -       */
 -      error = radix_tree_preload(GFP_KERNEL);
 -      if (error)
 -              goto uncharge;
 -      radix_tree_preload_end();
 +      /* No radix_tree_preload: swap entry keeps a place for page in tree */
  
        mutex_lock(&shmem_swaplist_mutex);
 -      list_for_each_safe(p, next, &shmem_swaplist) {
 -              info = list_entry(p, struct shmem_inode_info, swaplist);
 -              found = shmem_unuse_inode(info, entry, page);
 +      list_for_each_safe(this, next, &shmem_swaplist) {
 +              info = list_entry(this, struct shmem_inode_info, swaplist);
 +              if (info->swapped)
 +                      found = shmem_unuse_inode(info, swap, page);
 +              else
 +                      list_del_init(&info->swaplist);
                cond_resched();
                if (found)
                        break;
        }
        mutex_unlock(&shmem_swaplist_mutex);
  
 -uncharge:
        if (!found)
                mem_cgroup_uncharge_cache_page(page);
        if (found < 0)
@@@ -669,10 -1048,10 +669,10 @@@ out
  static int shmem_writepage(struct page *page, struct writeback_control *wbc)
  {
        struct shmem_inode_info *info;
 -      swp_entry_t *entry, swap;
        struct address_space *mapping;
 -      unsigned long index;
        struct inode *inode;
 +      swp_entry_t swap;
 +      pgoff_t index;
  
        BUG_ON(!PageLocked(page));
        mapping = page->mapping;
        /*
         * shmem_backing_dev_info's capabilities prevent regular writeback or
         * sync from ever calling shmem_writepage; but a stacking filesystem
 -       * may use the ->writepage of its underlying filesystem, in which case
 +       * might use ->writepage of its underlying filesystem, in which case
         * tmpfs should write out to swap only in response to memory pressure,
 -       * and not for the writeback threads or sync.  However, in those cases,
 -       * we do still want to check if there's a redundant swappage to be
 -       * discarded.
 +       * and not for the writeback threads or sync.
         */
 -      if (wbc->for_reclaim)
 -              swap = get_swap_page();
 -      else
 -              swap.val = 0;
 +      if (!wbc->for_reclaim) {
 +              WARN_ON_ONCE(1);        /* Still happens? Tell us about it! */
 +              goto redirty;
 +      }
 +      swap = get_swap_page();
 +      if (!swap.val)
 +              goto redirty;
  
        /*
         * Add inode to shmem_unuse()'s list of swapped-out inodes,
 -       * if it's not already there.  Do it now because we cannot take
 -       * mutex while holding spinlock, and must do so before the page
 -       * is moved to swap cache, when its pagelock no longer protects
 +       * if it's not already there.  Do it now before the page is
 +       * moved to swap cache, when its pagelock no longer protects
         * the inode from eviction.  But don't unlock the mutex until
 -       * we've taken the spinlock, because shmem_unuse_inode() will
 -       * prune a !swapped inode from the swaplist under both locks.
 +       * we've incremented swapped, because shmem_unuse_inode() will
 +       * prune a !swapped inode from the swaplist under this mutex.
         */
 -      if (swap.val) {
 -              mutex_lock(&shmem_swaplist_mutex);
 -              if (list_empty(&info->swaplist))
 -                      list_add_tail(&info->swaplist, &shmem_swaplist);
 -      }
 -
 -      spin_lock(&info->lock);
 -      if (swap.val)
 -              mutex_unlock(&shmem_swaplist_mutex);
 -
 -      if (index >= info->next_index) {
 -              BUG_ON(!(info->flags & SHMEM_TRUNCATE));
 -              goto unlock;
 -      }
 -      entry = shmem_swp_entry(info, index, NULL);
 -      if (entry->val) {
 -              /*
 -               * The more uptodate page coming down from a stacked
 -               * writepage should replace our old swappage.
 -               */
 -              free_swap_and_cache(*entry);
 -              shmem_swp_set(info, entry, 0);
 -      }
 -      shmem_recalc_inode(inode);
 +      mutex_lock(&shmem_swaplist_mutex);
 +      if (list_empty(&info->swaplist))
 +              list_add_tail(&info->swaplist, &shmem_swaplist);
  
 -      if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
 -              delete_from_page_cache(page);
 -              shmem_swp_set(info, entry, swap.val);
 -              shmem_swp_unmap(entry);
 +      if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
                swap_shmem_alloc(swap);
 +              shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
 +
 +              spin_lock(&info->lock);
 +              info->swapped++;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 +
 +              mutex_unlock(&shmem_swaplist_mutex);
                BUG_ON(page_mapped(page));
                swap_writepage(page, wbc);
                return 0;
        }
  
 -      shmem_swp_unmap(entry);
 -unlock:
 -      spin_unlock(&info->lock);
 -      /*
 -       * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 -       * clear SWAP_HAS_CACHE flag.
 -       */
 +      mutex_unlock(&shmem_swaplist_mutex);
        swapcache_free(swap, NULL);
  redirty:
        set_page_dirty(page);
@@@ -763,33 -1165,35 +763,33 @@@ static struct mempolicy *shmem_get_sbmp
  }
  #endif /* CONFIG_TMPFS */
  
 -static struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        struct mempolicy mpol, *spol;
        struct vm_area_struct pvma;
 -      struct page *page;
  
        spol = mpol_cond_copy(&mpol,
 -                              mpol_shared_policy_lookup(&info->policy, idx));
 +                      mpol_shared_policy_lookup(&info->policy, index));
  
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
 -      pvma.vm_pgoff = idx;
 +      pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
        pvma.vm_policy = spol;
 -      page = swapin_readahead(entry, gfp, &pvma, 0);
 -      return page;
 +      return swapin_readahead(swap, gfp, &pvma, 0);
  }
  
  static struct page *shmem_alloc_page(gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        struct vm_area_struct pvma;
  
        /* Create a pseudo vma that just contains the policy */
        pvma.vm_start = 0;
 -      pvma.vm_pgoff = idx;
 +      pvma.vm_pgoff = index;
        pvma.vm_ops = NULL;
 -      pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, idx);
 +      pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index);
  
        /*
         * alloc_page_vma() will drop the shared policy reference
  }
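
[Editor's note: the pseudo-vma trick above deserves a word. The NUMA allocation entry points read the mempolicy out of a vm_area_struct, so tmpfs fakes a minimal one on the stack carrying the shared policy for this index. A sketch of the same pattern with designated initializers, assuming nothing else in the vma is consulted:]

    struct vm_area_struct pvma = {
            .vm_start  = 0,
            .vm_pgoff  = index,  /* lets interleave policy vary by index */
            .vm_ops    = NULL,
            .vm_policy = mpol_shared_policy_lookup(&info->policy, index),
    };
    page = alloc_page_vma(gfp, &pvma, 0);
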
  #else /* !CONFIG_NUMA */
  #ifdef CONFIG_TMPFS
 -static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *p)
 +static inline void shmem_show_mpol(struct seq_file *seq, struct mempolicy *mpol)
  {
  }
  #endif /* CONFIG_TMPFS */
  
 -static inline struct page *shmem_swapin(swp_entry_t entry, gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +static inline struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
 -      return swapin_readahead(entry, gfp, NULL, 0);
 +      return swapin_readahead(swap, gfp, NULL, 0);
  }
  
  static inline struct page *shmem_alloc_page(gfp_t gfp,
 -                      struct shmem_inode_info *info, unsigned long idx)
 +                      struct shmem_inode_info *info, pgoff_t index)
  {
        return alloc_page(gfp);
  }
@@@ -824,195 -1228,311 +824,195 @@@ static inline struct mempolicy *shmem_g
  #endif
  
  /*
 - * shmem_getpage - either get the page from swap or allocate a new one
 + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
   *
   * If we allocate a new one we do not mark it dirty. That's up to the
   * vm. If we swap it in we mark it dirty, since we also free the swap
   * entry: a page cannot live in both the swap and page cache.
   */
 -static int shmem_getpage(struct inode *inode, unsigned long idx,
 -                      struct page **pagep, enum sgp_type sgp, int *type)
 +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 +      struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type)
  {
        struct address_space *mapping = inode->i_mapping;
 -      struct shmem_inode_info *info = SHMEM_I(inode);
 +      struct shmem_inode_info *info;
        struct shmem_sb_info *sbinfo;
 -      struct page *filepage = *pagep;
 -      struct page *swappage;
 -      struct page *prealloc_page = NULL;
 -      swp_entry_t *entry;
 +      struct page *page;
        swp_entry_t swap;
 -      gfp_t gfp;
        int error;
 +      int once = 0;
  
 -      if (idx >= SHMEM_MAX_INDEX)
 +      if (index > (MAX_LFS_FILESIZE >> PAGE_CACHE_SHIFT))
                return -EFBIG;
 +repeat:
 +      swap.val = 0;
 +      page = find_lock_page(mapping, index);
 +      if (radix_tree_exceptional_entry(page)) {
 +              swap = radix_to_swp_entry(page);
 +              page = NULL;
 +      }
  
 -      if (type)
 -              *type = 0;
 +      if (sgp != SGP_WRITE &&
 +          ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 +              error = -EINVAL;
 +              goto failed;
 +      }
  
 -      /*
 -       * Normally, filepage is NULL on entry, and either found
 -       * uptodate immediately, or allocated and zeroed, or read
 -       * in under swappage, which is then assigned to filepage.
 -       * But shmem_readpage (required for splice) passes in a locked
 -       * filepage, which may be found not uptodate by other callers
 -       * too, and may need to be copied from the swappage read in.
 -       */
 -repeat:
 -      if (!filepage)
 -              filepage = find_lock_page(mapping, idx);
 -      if (filepage && PageUptodate(filepage))
 -              goto done;
 -      gfp = mapping_gfp_mask(mapping);
 -      if (!filepage) {
 +      if (page || (sgp == SGP_READ && !swap.val)) {
                /*
 -               * Try to preload while we can wait, to not make a habit of
 -               * draining atomic reserves; but don't latch on to this cpu.
 +               * Once we can get the page lock, it must be uptodate:
 +               * if there were an error in reading back from swap,
 +               * the page would not be inserted into the filecache.
                 */
 -              error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
 -              if (error)
 -                      goto failed;
 -              radix_tree_preload_end();
 -              if (sgp != SGP_READ && !prealloc_page) {
 -                      /* We don't care if this fails */
 -                      prealloc_page = shmem_alloc_page(gfp, info, idx);
 -                      if (prealloc_page) {
 -                              if (mem_cgroup_cache_charge(prealloc_page,
 -                                              current->mm, GFP_KERNEL)) {
 -                                      page_cache_release(prealloc_page);
 -                                      prealloc_page = NULL;
 -                              }
 -                      }
 -              }
 +              BUG_ON(page && !PageUptodate(page));
 +              *pagep = page;
 +              return 0;
        }
 -      error = 0;
  
 -      spin_lock(&info->lock);
 -      shmem_recalc_inode(inode);
 -      entry = shmem_swp_alloc(info, idx, sgp);
 -      if (IS_ERR(entry)) {
 -              spin_unlock(&info->lock);
 -              error = PTR_ERR(entry);
 -              goto failed;
 -      }
 -      swap = *entry;
 +      /*
 +       * Fast cache lookup did not find it:
 +       * bring it back from swap or allocate.
 +       */
 +      info = SHMEM_I(inode);
 +      sbinfo = SHMEM_SB(inode->i_sb);
  
        if (swap.val) {
                /* Look it up and read it in.. */
 -              swappage = lookup_swap_cache(swap);
 -              if (!swappage) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 +              page = lookup_swap_cache(swap);
 +              if (!page) {
                        /* here we actually do the io */
 -                      if (type)
 -                              *type |= VM_FAULT_MAJOR;
 -                      swappage = shmem_swapin(swap, gfp, info, idx);
 -                      if (!swappage) {
 -                              spin_lock(&info->lock);
 -                              entry = shmem_swp_alloc(info, idx, sgp);
 -                              if (IS_ERR(entry))
 -                                      error = PTR_ERR(entry);
 -                              else {
 -                                      if (entry->val == swap.val)
 -                                              error = -ENOMEM;
 -                                      shmem_swp_unmap(entry);
 -                              }
 -                              spin_unlock(&info->lock);
 -                              if (error)
 -                                      goto failed;
 -                              goto repeat;
 +                      if (fault_type)
 +                              *fault_type |= VM_FAULT_MAJOR;
 +                      page = shmem_swapin(swap, gfp, info, index);
 +                      if (!page) {
 +                              error = -ENOMEM;
 +                              goto failed;
                        }
 -                      wait_on_page_locked(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
                }
  
                /* We have to do this with page locked to prevent races */
 -              if (!trylock_page(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_locked(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -              if (PageWriteback(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_writeback(swappage);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -              if (!PageUptodate(swappage)) {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 +              lock_page(page);
 +              if (!PageUptodate(page)) {
                        error = -EIO;
                        goto failed;
                }
 -
 -              if (filepage) {
 -                      shmem_swp_set(info, entry, 0);
 -                      shmem_swp_unmap(entry);
 -                      delete_from_swap_cache(swappage);
 -                      spin_unlock(&info->lock);
 -                      copy_highpage(filepage, swappage);
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      flush_dcache_page(filepage);
 -                      SetPageUptodate(filepage);
 -                      set_page_dirty(filepage);
 -                      swap_free(swap);
 -              } else if (!(error = add_to_page_cache_locked(swappage, mapping,
 -                                      idx, GFP_NOWAIT))) {
 -                      info->flags |= SHMEM_PAGEIN;
 -                      shmem_swp_set(info, entry, 0);
 -                      shmem_swp_unmap(entry);
 -                      delete_from_swap_cache(swappage);
 -                      spin_unlock(&info->lock);
 -                      filepage = swappage;
 -                      set_page_dirty(filepage);
 -                      swap_free(swap);
 -              } else {
 -                      shmem_swp_unmap(entry);
 -                      spin_unlock(&info->lock);
 -                      if (error == -ENOMEM) {
 -                              /*
 -                               * reclaim from proper memory cgroup and
 -                               * call memcg's OOM if needed.
 -                               */
 -                              error = mem_cgroup_shmem_charge_fallback(
 -                                                              swappage,
 -                                                              current->mm,
 -                                                              gfp);
 -                              if (error) {
 -                                      unlock_page(swappage);
 -                                      page_cache_release(swappage);
 -                                      goto failed;
 -                              }
 -                      }
 -                      unlock_page(swappage);
 -                      page_cache_release(swappage);
 -                      goto repeat;
 -              }
 -      } else if (sgp == SGP_READ && !filepage) {
 -              shmem_swp_unmap(entry);
 -              filepage = find_get_page(mapping, idx);
 -              if (filepage &&
 -                  (!PageUptodate(filepage) || !trylock_page(filepage))) {
 -                      spin_unlock(&info->lock);
 -                      wait_on_page_locked(filepage);
 -                      page_cache_release(filepage);
 -                      filepage = NULL;
 -                      goto repeat;
 +              wait_on_page_writeback(page);
 +
 +              /* Someone may have already done it for us */
 +              if (page->mapping) {
 +                      if (page->mapping == mapping &&
 +                          page->index == index)
 +                              goto done;
 +                      error = -EEXIST;
 +                      goto failed;
                }
 +
 +              error = mem_cgroup_cache_charge(page, current->mm,
 +                                              gfp & GFP_RECLAIM_MASK);
 +              if (!error)
 +                      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              gfp, swp_to_radix_entry(swap));
 +              if (error)
 +                      goto failed;
 +
 +              spin_lock(&info->lock);
 +              info->swapped--;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 +
 +              delete_from_swap_cache(page);
 +              set_page_dirty(page);
 +              swap_free(swap);
 +
        } else {
 -              shmem_swp_unmap(entry);
 -              sbinfo = SHMEM_SB(inode->i_sb);
 +              if (shmem_acct_block(info->flags)) {
 +                      error = -ENOSPC;
 +                      goto failed;
 +              }
                if (sbinfo->max_blocks) {
                        if (percpu_counter_compare(&sbinfo->used_blocks,
 -                                              sbinfo->max_blocks) >= 0 ||
 -                          shmem_acct_block(info->flags))
 -                              goto nospace;
 -                      percpu_counter_inc(&sbinfo->used_blocks);
 -                      spin_lock(&inode->i_lock);
 -                      inode->i_blocks += BLOCKS_PER_PAGE;
 -                      spin_unlock(&inode->i_lock);
 -              } else if (shmem_acct_block(info->flags))
 -                      goto nospace;
 -
 -              if (!filepage) {
 -                      int ret;
 -
 -                      if (!prealloc_page) {
 -                              spin_unlock(&info->lock);
 -                              filepage = shmem_alloc_page(gfp, info, idx);
 -                              if (!filepage) {
 -                                      shmem_unacct_blocks(info->flags, 1);
 -                                      shmem_free_blocks(inode, 1);
 -                                      error = -ENOMEM;
 -                                      goto failed;
 -                              }
 -                              SetPageSwapBacked(filepage);
 -
 -                              /*
 -                               * Precharge page while we can wait, compensate
 -                               * after
 -                               */
 -                              error = mem_cgroup_cache_charge(filepage,
 -                                      current->mm, GFP_KERNEL);
 -                              if (error) {
 -                                      page_cache_release(filepage);
 -                                      shmem_unacct_blocks(info->flags, 1);
 -                                      shmem_free_blocks(inode, 1);
 -                                      filepage = NULL;
 -                                      goto failed;
 -                              }
 -
 -                              spin_lock(&info->lock);
 -                      } else {
 -                              filepage = prealloc_page;
 -                              prealloc_page = NULL;
 -                              SetPageSwapBacked(filepage);
 +                                              sbinfo->max_blocks) >= 0) {
 +                              error = -ENOSPC;
 +                              goto unacct;
                        }
 +                      percpu_counter_inc(&sbinfo->used_blocks);
 +              }
  
 -                      entry = shmem_swp_alloc(info, idx, sgp);
 -                      if (IS_ERR(entry))
 -                              error = PTR_ERR(entry);
 -                      else {
 -                              swap = *entry;
 -                              shmem_swp_unmap(entry);
 -                      }
 -                      ret = error || swap.val;
 -                      if (ret)
 -                              mem_cgroup_uncharge_cache_page(filepage);
 -                      else
 -                              ret = add_to_page_cache_lru(filepage, mapping,
 -                                              idx, GFP_NOWAIT);
 -                      /*
 -                       * At add_to_page_cache_lru() failure, uncharge will
 -                       * be done automatically.
 -                       */
 -                      if (ret) {
 -                              spin_unlock(&info->lock);
 -                              page_cache_release(filepage);
 -                              shmem_unacct_blocks(info->flags, 1);
 -                              shmem_free_blocks(inode, 1);
 -                              filepage = NULL;
 -                              if (error)
 -                                      goto failed;
 -                              goto repeat;
 -                      }
 -                      info->flags |= SHMEM_PAGEIN;
 +              page = shmem_alloc_page(gfp, info, index);
 +              if (!page) {
 +                      error = -ENOMEM;
 +                      goto decused;
                }
  
 +              SetPageSwapBacked(page);
 +              __set_page_locked(page);
 +              error = mem_cgroup_cache_charge(page, current->mm,
 +                                              gfp & GFP_RECLAIM_MASK);
 +              if (!error)
 +                      error = shmem_add_to_page_cache(page, mapping, index,
 +                                              gfp, NULL);
 +              if (error)
 +                      goto decused;
 +              lru_cache_add_anon(page);
 +
 +              spin_lock(&info->lock);
                info->alloced++;
 +              inode->i_blocks += BLOCKS_PER_PAGE;
 +              shmem_recalc_inode(inode);
                spin_unlock(&info->lock);
 -              clear_highpage(filepage);
 -              flush_dcache_page(filepage);
 -              SetPageUptodate(filepage);
 +
 +              clear_highpage(page);
 +              flush_dcache_page(page);
 +              SetPageUptodate(page);
                if (sgp == SGP_DIRTY)
 -                      set_page_dirty(filepage);
 +                      set_page_dirty(page);
        }
  done:
 -      *pagep = filepage;
 -      error = 0;
 -      goto out;
 +      /* Perhaps the file has been truncated since we checked */
 +      if (sgp != SGP_WRITE &&
 +          ((loff_t)index << PAGE_CACHE_SHIFT) >= i_size_read(inode)) {
 +              error = -EINVAL;
 +              goto trunc;
 +      }
 +      *pagep = page;
 +      return 0;
  
 -nospace:
        /*
 -       * Perhaps the page was brought in from swap between find_lock_page
 -       * and taking info->lock?  We allow for that at add_to_page_cache_lru,
 -       * but must also avoid reporting a spurious ENOSPC while working on a
 -       * full tmpfs.  (When filepage has been passed in to shmem_getpage, it
 -       * is already in page cache, which prevents this race from occurring.)
 +       * Error recovery.
         */
 -      if (!filepage) {
 -              struct page *page = find_get_page(mapping, idx);
 -              if (page) {
 -                      spin_unlock(&info->lock);
 -                      page_cache_release(page);
 -                      goto repeat;
 -              }
 -      }
 +trunc:
 +      ClearPageDirty(page);
 +      delete_from_page_cache(page);
 +      spin_lock(&info->lock);
 +      info->alloced--;
 +      inode->i_blocks -= BLOCKS_PER_PAGE;
        spin_unlock(&info->lock);
 -      error = -ENOSPC;
 +decused:
 +      if (sbinfo->max_blocks)
 +              percpu_counter_add(&sbinfo->used_blocks, -1);
 +unacct:
 +      shmem_unacct_blocks(info->flags, 1);
  failed:
 -      if (*pagep != filepage) {
 -              unlock_page(filepage);
 -              page_cache_release(filepage);
 +      if (swap.val && error != -EINVAL) {
 +              struct page *test = find_get_page(mapping, index);
 +              if (test && !radix_tree_exceptional_entry(test))
 +                      page_cache_release(test);
 +              /* Have another try if the entry has changed */
 +              if (test != swp_to_radix_entry(swap))
 +                      error = -EEXIST;
        }
 -out:
 -      if (prealloc_page) {
 -              mem_cgroup_uncharge_cache_page(prealloc_page);
 -              page_cache_release(prealloc_page);
 +      if (page) {
 +              unlock_page(page);
 +              page_cache_release(page);
 +      }
 +      if (error == -ENOSPC && !once++) {
 +              info = SHMEM_I(inode);
 +              spin_lock(&info->lock);
 +              shmem_recalc_inode(inode);
 +              spin_unlock(&info->lock);
 +              goto repeat;
        }
 +      if (error == -EEXIST)
 +              goto repeat;
        return error;
  }
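
The rewritten shmem_getpage_gfp() above drops shmem's private swap index (the old shmem_swp_* metadata pages) and instead stores swap entries directly in the page-cache radix tree as "exceptional" entries. A minimal sketch of the encoding, along the lines of the swp_to_radix_entry()/radix_to_swp_entry() helpers used above (illustrative of the scheme, not a verbatim copy of include/linux/swapops.h):

#include <linux/radix-tree.h>
#include <linux/swap.h>

/*
 * An exceptional slot has RADIX_TREE_EXCEPTIONAL_ENTRY (bit 1) set, so
 * it can never be mistaken for a struct page pointer; the swap entry
 * value lives in the bits above it.
 */
static inline void *swp_to_radix_entry(swp_entry_t entry)
{
	unsigned long value = entry.val << RADIX_TREE_EXCEPTIONAL_SHIFT;

	return (void *)(value | RADIX_TREE_EXCEPTIONAL_ENTRY);
}

static inline swp_entry_t radix_to_swp_entry(void *arg)
{
	swp_entry_t entry;

	entry.val = (unsigned long)arg >> RADIX_TREE_EXCEPTIONAL_SHIFT;
	return entry;
}

This is why the failure path above re-checks the slot with radix_tree_exceptional_entry(): find_get_page() may now hand back either a real page or a packed swap entry.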
  
@@@ -1020,34 -1540,36 +1020,34 @@@ static int shmem_fault(struct vm_area_s
  {
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        int error;
 -      int ret;
 -
 -      if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              return VM_FAULT_SIGBUS;
 +      int ret = VM_FAULT_LOCKED;
  
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
        if (error)
                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
 +
        if (ret & VM_FAULT_MAJOR) {
                count_vm_event(PGMAJFAULT);
                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
        }
 -      return ret | VM_FAULT_LOCKED;
 +      return ret;
  }
  
  #ifdef CONFIG_NUMA
 -static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *new)
 +static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol)
  {
 -      struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 -      return mpol_set_shared_policy(&SHMEM_I(i)->policy, vma, new);
 +      struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 +      return mpol_set_shared_policy(&SHMEM_I(inode)->policy, vma, mpol);
  }
  
  static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
                                          unsigned long addr)
  {
 -      struct inode *i = vma->vm_file->f_path.dentry->d_inode;
 -      unsigned long idx;
 +      struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
 +      pgoff_t index;
  
 -      idx = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 -      return mpol_shared_policy_lookup(&SHMEM_I(i)->policy, idx);
 +      index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 +      return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index);
  }
  #endif
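
These two callbacks are what give a mapped tmpfs file per-VMA NUMA policy. A runnable userspace sketch that ends up in them (path and sizes illustrative; assumes libnuma's <numaif.h> for the mbind() wrapper): mbind() on the mapping is stored via shmem_set_policy(), and a later fault consults shmem_get_policy().

#include <numaif.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 1;	/* node 0 only */
	int fd = open("/dev/shm/example", O_CREAT | O_RDWR, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len) < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* Recorded by shmem_set_policy(); consulted at fault time */
	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8, 0))
		return 1;
	p[0] = 1;			/* fault a page in under the policy */
	munmap(p, len);
	close(fd);
	return 0;
}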
  
@@@ -1145,7 -1667,20 +1145,7 @@@ static struct inode *shmem_get_inode(st
  
  #ifdef CONFIG_TMPFS
  static const struct inode_operations shmem_symlink_inode_operations;
 -static const struct inode_operations shmem_symlink_inline_operations;
 -
 -/*
 - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin;
 - * but providing them allows a tmpfs file to be used for splice, sendfile, and
 - * below the loop driver, in the generic fashion that many filesystems support.
 - */
 -static int shmem_readpage(struct file *file, struct page *page)
 -{
 -      struct inode *inode = page->mapping->host;
 -      int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL);
 -      unlock_page(page);
 -      return error;
 -}
 +static const struct inode_operations shmem_short_symlink_operations;
  
  static int
  shmem_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
  {
        struct inode *inode = mapping->host;
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 -      *pagep = NULL;
        return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL);
  }
  
@@@ -1178,8 -1714,7 +1178,8 @@@ static void do_shmem_file_read(struct f
  {
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
 -      unsigned long index, offset;
 +      pgoff_t index;
 +      unsigned long offset;
        enum sgp_type sgp = SGP_READ;
  
        /*
         * Might this read be for a stacking filesystem?  Then when reading
         * holes of a sparse file, we actually need to allocate those pages,
         * and even mark them dirty, so it cannot exceed the max_blocks limit.
         */
        if (segment_eq(get_fs(), KERNEL_DS))
                sgp = SGP_DIRTY;

        index = *ppos >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;
  
        for (;;) {
                struct page *page = NULL;
 -              unsigned long end_index, nr, ret;
 +              pgoff_t end_index;
 +              unsigned long nr, ret;
                loff_t i_size = i_size_read(inode);
  
                end_index = i_size >> PAGE_CACHE_SHIFT;
@@@ -1312,119 -1846,6 +1312,119 @@@ static ssize_t shmem_file_aio_read(stru
        return retval;
  }
  
 +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
 +                              struct pipe_inode_info *pipe, size_t len,
 +                              unsigned int flags)
 +{
 +      struct address_space *mapping = in->f_mapping;
 +      struct inode *inode = mapping->host;
 +      unsigned int loff, nr_pages, req_pages;
 +      struct page *pages[PIPE_DEF_BUFFERS];
 +      struct partial_page partial[PIPE_DEF_BUFFERS];
 +      struct page *page;
 +      pgoff_t index, end_index;
 +      loff_t isize, left;
 +      int error, page_nr;
 +      struct splice_pipe_desc spd = {
 +              .pages = pages,
 +              .partial = partial,
 +              .flags = flags,
 +              .ops = &page_cache_pipe_buf_ops,
 +              .spd_release = spd_release_page,
 +      };
 +
 +      isize = i_size_read(inode);
 +      if (unlikely(*ppos >= isize))
 +              return 0;
 +
 +      left = isize - *ppos;
 +      if (unlikely(left < len))
 +              len = left;
 +
 +      if (splice_grow_spd(pipe, &spd))
 +              return -ENOMEM;
 +
 +      index = *ppos >> PAGE_CACHE_SHIFT;
 +      loff = *ppos & ~PAGE_CACHE_MASK;
 +      req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 +      nr_pages = min(req_pages, pipe->buffers);
 +
 +      spd.nr_pages = find_get_pages_contig(mapping, index,
 +                                              nr_pages, spd.pages);
 +      index += spd.nr_pages;
 +      error = 0;
 +
 +      while (spd.nr_pages < nr_pages) {
 +              error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL);
 +              if (error)
 +                      break;
 +              unlock_page(page);
 +              spd.pages[spd.nr_pages++] = page;
 +              index++;
 +      }
 +
 +      index = *ppos >> PAGE_CACHE_SHIFT;
 +      nr_pages = spd.nr_pages;
 +      spd.nr_pages = 0;
 +
 +      for (page_nr = 0; page_nr < nr_pages; page_nr++) {
 +              unsigned int this_len;
 +
 +              if (!len)
 +                      break;
 +
 +              this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff);
 +              page = spd.pages[page_nr];
 +
 +              if (!PageUptodate(page) || page->mapping != mapping) {
 +                      error = shmem_getpage(inode, index, &page,
 +                                                      SGP_CACHE, NULL);
 +                      if (error)
 +                              break;
 +                      unlock_page(page);
 +                      page_cache_release(spd.pages[page_nr]);
 +                      spd.pages[page_nr] = page;
 +              }
 +
 +              isize = i_size_read(inode);
 +              end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 +              if (unlikely(!isize || index > end_index))
 +                      break;
 +
 +              if (end_index == index) {
 +                      unsigned int plen;
 +
 +                      plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
 +                      if (plen <= loff)
 +                              break;
 +
 +                      this_len = min(this_len, plen - loff);
 +                      len = this_len;
 +              }
 +
 +              spd.partial[page_nr].offset = loff;
 +              spd.partial[page_nr].len = this_len;
 +              len -= this_len;
 +              loff = 0;
 +              spd.nr_pages++;
 +              index++;
 +      }
 +
 +      while (page_nr < nr_pages)
 +              page_cache_release(spd.pages[page_nr++]);
 +
 +      if (spd.nr_pages)
 +              error = splice_to_pipe(pipe, &spd);
 +
 +      splice_shrink_spd(pipe, &spd);
 +
 +      if (error > 0) {
 +              *ppos += error;
 +              file_accessed(in);
 +      }
 +      return error;
 +}
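
From userspace, the path served by shmem_file_splice_read() is an ordinary splice() from a tmpfs file descriptor into a pipe; it previously went through generic_file_splice_read() and the now-removed shmem_readpage(). A runnable sketch (file name illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	int in = open("/dev/shm/example", O_RDONLY);
	ssize_t n;

	if (in < 0 || pipe(fds) < 0)
		return 1;
	/* Move up to 64k of tmpfs page-cache data into the pipe */
	n = splice(in, NULL, fds[1], NULL, 65536, SPLICE_F_MOVE);
	if (n < 0) {
		perror("splice");
		return 1;
	}
	printf("spliced %zd bytes\n", n);
	return 0;
}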
 +
  static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
  {
        struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb);
        buf->f_namelen = NAME_MAX;
        if (sbinfo->max_blocks) {
                buf->f_blocks = sbinfo->max_blocks;
 -              buf->f_bavail = buf->f_bfree =
 -                              sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks);
 +              buf->f_bavail =
 +              buf->f_bfree  = sbinfo->max_blocks -
 +                              percpu_counter_sum(&sbinfo->used_blocks);
        }
        if (sbinfo->max_inodes) {
                buf->f_files = sbinfo->max_inodes;
@@@ -1458,7 -1878,7 +1458,7 @@@ shmem_mknod(struct inode *dir, struct d
        inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
        if (inode) {
                error = security_inode_init_security(inode, dir,
-                                                    &dentry->d_name, NULL,
+                                                    &dentry->d_name,
                                                     NULL, NULL);
                if (error) {
                        if (error != -EOPNOTSUPP) {
@@@ -1586,7 -2006,7 +1586,7 @@@ static int shmem_symlink(struct inode *
        int error;
        int len;
        struct inode *inode;
 -      struct page *page = NULL;
 +      struct page *page;
        char *kaddr;
        struct shmem_inode_info *info;
  
        if (!inode)
                return -ENOSPC;
  
-       error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+       error = security_inode_init_security(inode, dir, &dentry->d_name,
                                             NULL, NULL);
        if (error) {
                if (error != -EOPNOTSUPP) {
  
        info = SHMEM_I(inode);
        inode->i_size = len-1;
 -      if (len <= SHMEM_SYMLINK_INLINE_LEN) {
 -              /* do it inline */
 -              memcpy(info->inline_symlink, symname, len);
 -              inode->i_op = &shmem_symlink_inline_operations;
 +      if (len <= SHORT_SYMLINK_LEN) {
 +              info->symlink = kmemdup(symname, len, GFP_KERNEL);
 +              if (!info->symlink) {
 +                      iput(inode);
 +                      return -ENOMEM;
 +              }
 +              inode->i_op = &shmem_short_symlink_operations;
        } else {
                error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL);
                if (error) {
        return 0;
  }
  
 -static void *shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
 +static void *shmem_follow_short_symlink(struct dentry *dentry, struct nameidata *nd)
  {
 -      nd_set_link(nd, SHMEM_I(dentry->d_inode)->inline_symlink);
 +      nd_set_link(nd, SHMEM_I(dentry->d_inode)->symlink);
        return NULL;
  }
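
For symlinks short enough to live in the kmemdup()'d buffer, follow_link can return NULL: there is nothing to release afterwards. The page-based shmem_follow_link() below instead returns the page as the nameidata cookie, and the matching put_link has to kunmap and release it. Roughly, a sketch in the style of the real shmem_put_link(), not quoted verbatim:

static void shmem_put_link(struct dentry *dentry, struct nameidata *nd,
			   void *cookie)
{
	if (!IS_ERR(nd_get_link(nd))) {
		struct page *page = cookie;

		kunmap(page);
		mark_page_accessed(page);
		page_cache_release(page);
	}
}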
  
  static void *shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
  {
        struct page *page = NULL;
 -      int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 -      nd_set_link(nd, res ? ERR_PTR(res) : kmap(page));
 +      int error = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ, NULL);
 +      nd_set_link(nd, error ? ERR_PTR(error) : kmap(page));
        if (page)
                unlock_page(page);
        return page;
@@@ -1760,6 -2177,7 +1760,6 @@@ out
        return err;
  }
  
 -
  static const struct xattr_handler *shmem_xattr_handlers[] = {
  #ifdef CONFIG_TMPFS_POSIX_ACL
        &generic_acl_access_handler,
@@@ -1889,9 -2307,9 +1889,9 @@@ static ssize_t shmem_listxattr(struct d
  }
  #endif /* CONFIG_TMPFS_XATTR */
  
 -static const struct inode_operations shmem_symlink_inline_operations = {
 +static const struct inode_operations shmem_short_symlink_operations = {
        .readlink       = generic_readlink,
 -      .follow_link    = shmem_follow_link_inline,
 +      .follow_link    = shmem_follow_short_symlink,
  #ifdef CONFIG_TMPFS_XATTR
        .setxattr       = shmem_setxattr,
        .getxattr       = shmem_getxattr,
@@@ -2091,7 -2509,8 +2091,7 @@@ static int shmem_remount_fs(struct supe
        if (config.max_inodes < inodes)
                goto out;
        /*
 -       * Those tests also disallow limited->unlimited while any are in
 -       * use, so i_blocks will always be zero when max_blocks is zero;
 +       * Those tests disallow limited->unlimited while any are in use;
         * but we must separately disallow unlimited->limited, because
         * in that case we have no record of how much is already in use.
         */
@@@ -2183,7 -2602,7 +2183,7 @@@ int shmem_fill_super(struct super_bloc
                goto failed;
        sbinfo->free_inodes = sbinfo->max_inodes;
  
 -      sb->s_maxbytes = SHMEM_MAX_BYTES;
 +      sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_CACHE_SIZE;
        sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
        sb->s_magic = TMPFS_MAGIC;
@@@ -2218,14 -2637,14 +2218,14 @@@ static struct kmem_cache *shmem_inode_c
  
  static struct inode *shmem_alloc_inode(struct super_block *sb)
  {
 -      struct shmem_inode_info *p;
 -      p = (struct shmem_inode_info *)kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 -      if (!p)
 +      struct shmem_inode_info *info;
 +      info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 +      if (!info)
                return NULL;
 -      return &p->vfs_inode;
 +      return &info->vfs_inode;
  }
  
 -static void shmem_i_callback(struct rcu_head *head)
 +static void shmem_destroy_callback(struct rcu_head *head)
  {
        struct inode *inode = container_of(head, struct inode, i_rcu);
        INIT_LIST_HEAD(&inode->i_dentry);
        kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
  }
  
  static void shmem_destroy_inode(struct inode *inode)
  {
 -      if ((inode->i_mode & S_IFMT) == S_IFREG) {
 -              /* only struct inode is valid if it's an inline symlink */
 +      if ((inode->i_mode & S_IFMT) == S_IFREG)
                mpol_free_shared_policy(&SHMEM_I(inode)->policy);
 -      }
 -      call_rcu(&inode->i_rcu, shmem_i_callback);
 +      call_rcu(&inode->i_rcu, shmem_destroy_callback);
  }
  
 -static void init_once(void *foo)
 +static void shmem_init_inode(void *foo)
  {
 -      struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 -
 -      inode_init_once(&p->vfs_inode);
 +      struct shmem_inode_info *info = foo;
 +      inode_init_once(&info->vfs_inode);
  }
  
 -static int init_inodecache(void)
 +static int shmem_init_inodecache(void)
  {
        shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
                                sizeof(struct shmem_inode_info),
 -                              0, SLAB_PANIC, init_once);
 +                              0, SLAB_PANIC, shmem_init_inode);
        return 0;
  }
  
 -static void destroy_inodecache(void)
 +static void shmem_destroy_inodecache(void)
  {
        kmem_cache_destroy(shmem_inode_cachep);
  }
@@@ -2262,6 -2684,7 +2262,6 @@@ static const struct address_space_opera
        .writepage      = shmem_writepage,
        .set_page_dirty = __set_page_dirty_no_writeback,
  #ifdef CONFIG_TMPFS
 -      .readpage       = shmem_readpage,
        .write_begin    = shmem_write_begin,
        .write_end      = shmem_write_end,
  #endif
@@@ -2278,7 -2701,7 +2278,7 @@@ static const struct file_operations shm
        .aio_read       = shmem_file_aio_read,
        .aio_write      = generic_file_aio_write,
        .fsync          = noop_fsync,
 -      .splice_read    = generic_file_splice_read,
 +      .splice_read    = shmem_file_splice_read,
        .splice_write   = generic_file_splice_write,
  #endif
  };
@@@ -2292,6 -2715,10 +2292,6 @@@ static const struct inode_operations sh
        .listxattr      = shmem_listxattr,
        .removexattr    = shmem_removexattr,
  #endif
 -#ifdef CONFIG_TMPFS_POSIX_ACL
 -      .check_acl      = generic_check_acl,
 -#endif
 -
  };
  
  static const struct inode_operations shmem_dir_inode_operations = {
  #endif
  #ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
 -      .check_acl      = generic_check_acl,
  #endif
  };
  
@@@ -2326,6 -2754,7 +2326,6 @@@ static const struct inode_operations sh
  #endif
  #ifdef CONFIG_TMPFS_POSIX_ACL
        .setattr        = shmem_setattr,
 -      .check_acl      = generic_check_acl,
  #endif
  };
  
@@@ -2350,20 -2779,21 +2350,20 @@@ static const struct vm_operations_struc
  #endif
  };
  
 -
  static struct dentry *shmem_mount(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data)
  {
        return mount_nodev(fs_type, flags, data, shmem_fill_super);
  }
  
 -static struct file_system_type tmpfs_fs_type = {
 +static struct file_system_type shmem_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "tmpfs",
        .mount          = shmem_mount,
        .kill_sb        = kill_litter_super,
  };
  
 -int __init init_tmpfs(void)
 +int __init shmem_init(void)
  {
        int error;
  
        error = bdi_init(&shmem_backing_dev_info);
        if (error)
                goto out4;
  
 -      error = init_inodecache();
 +      error = shmem_init_inodecache();
        if (error)
                goto out3;
  
 -      error = register_filesystem(&tmpfs_fs_type);
 +      error = register_filesystem(&shmem_fs_type);
        if (error) {
                printk(KERN_ERR "Could not register tmpfs\n");
                goto out2;
        }
  
 -      shm_mnt = vfs_kern_mount(&tmpfs_fs_type, MS_NOUSER,
 -                              tmpfs_fs_type.name, NULL);
 +      shm_mnt = vfs_kern_mount(&shmem_fs_type, MS_NOUSER,
 +                               shmem_fs_type.name, NULL);
        if (IS_ERR(shm_mnt)) {
                error = PTR_ERR(shm_mnt);
                printk(KERN_ERR "Could not kern_mount tmpfs\n");
                goto out1;
        }
        return 0;
  
  out1:
 -      unregister_filesystem(&tmpfs_fs_type);
 +      unregister_filesystem(&shmem_fs_type);
  out2:
 -      destroy_inodecache();
 +      shmem_destroy_inodecache();
  out3:
        bdi_destroy(&shmem_backing_dev_info);
  out4:
        return error;
  }
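
The out1..out4 labels above follow the standard kernel unwind idiom: each failure label undoes exactly the setup steps that had already succeeded, in reverse order. The shape in isolation (setup_*/teardown_* names are illustrative):

static int __init example_init(void)
{
	int error;

	error = setup_a();
	if (error)
		goto out_none;
	error = setup_b();
	if (error)
		goto out_a;		/* undo a only */
	error = setup_c();
	if (error)
		goto out_b;		/* undo b, then a */
	return 0;

out_b:
	teardown_b();
out_a:
	teardown_a();
out_none:
	return error;
}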
  
 -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 -/**
 - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
 - * @inode: the inode to be searched
 - * @pgoff: the offset to be searched
 - * @pagep: the pointer for the found page to be stored
 - * @ent: the pointer for the found swap entry to be stored
 - *
 - * If a page is found, refcount of it is incremented. Callers should handle
 - * these refcount.
 - */
 -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 -                                      struct page **pagep, swp_entry_t *ent)
 -{
 -      swp_entry_t entry = { .val = 0 }, *ptr;
 -      struct page *page = NULL;
 -      struct shmem_inode_info *info = SHMEM_I(inode);
 -
 -      if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              goto out;
 -
 -      spin_lock(&info->lock);
 -      ptr = shmem_swp_entry(info, pgoff, NULL);
 -#ifdef CONFIG_SWAP
 -      if (ptr && ptr->val) {
 -              entry.val = ptr->val;
 -              page = find_get_page(&swapper_space, entry.val);
 -      } else
 -#endif
 -              page = find_get_page(inode->i_mapping, pgoff);
 -      if (ptr)
 -              shmem_swp_unmap(ptr);
 -      spin_unlock(&info->lock);
 -out:
 -      *pagep = page;
 -      *ent = entry;
 -}
 -#endif
 -
  #else /* !CONFIG_SHMEM */
  
  /*
  
  #include <linux/ramfs.h>
  
 -static struct file_system_type tmpfs_fs_type = {
 +static struct file_system_type shmem_fs_type = {
        .name           = "tmpfs",
        .mount          = ramfs_mount,
        .kill_sb        = kill_litter_super,
  };
  
 -int __init init_tmpfs(void)
 +int __init shmem_init(void)
  {
 -      BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
 +      BUG_ON(register_filesystem(&shmem_fs_type) != 0);
  
 -      shm_mnt = kern_mount(&tmpfs_fs_type);
 +      shm_mnt = kern_mount(&shmem_fs_type);
        BUG_ON(IS_ERR(shm_mnt));
  
        return 0;
  }
  
 -int shmem_unuse(swp_entry_t entry, struct page *page)
 +int shmem_unuse(swp_entry_t swap, struct page *page)
  {
        return 0;
  }
@@@ -2440,17 -2909,43 +2440,17 @@@ int shmem_lock(struct file *file, int l
        return 0;
  }
  
 -void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end)
 +void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
  {
 -      truncate_inode_pages_range(inode->i_mapping, start, end);
 +      truncate_inode_pages_range(inode->i_mapping, lstart, lend);
  }
  EXPORT_SYMBOL_GPL(shmem_truncate_range);
  
 -#ifdef CONFIG_CGROUP_MEM_RES_CTLR
 -/**
 - * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file
 - * @inode: the inode to be searched
 - * @pgoff: the offset to be searched
 - * @pagep: the pointer for the found page to be stored
 - * @ent: the pointer for the found swap entry to be stored
 - *
 - * If a page is found, refcount of it is incremented. Callers should handle
 - * these refcount.
 - */
 -void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff,
 -                                      struct page **pagep, swp_entry_t *ent)
 -{
 -      struct page *page = NULL;
 -
 -      if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode))
 -              goto out;
 -      page = find_get_page(inode->i_mapping, pgoff);
 -out:
 -      *pagep = page;
 -      *ent = (swp_entry_t){ .val = 0 };
 -}
 -#endif
 -
  #define shmem_vm_ops                          generic_file_vm_ops
  #define shmem_file_operations                 ramfs_file_operations
  #define shmem_get_inode(sb, dir, mode, dev, flags)    ramfs_get_inode(sb, dir, mode, dev)
  #define shmem_acct_size(flags, size)          0
  #define shmem_unacct_size(flags, size)                do {} while (0)
 -#define SHMEM_MAX_BYTES                               MAX_LFS_FILESIZE
  
  #endif /* CONFIG_SHMEM */
  
@@@ -2474,7 -2969,7 +2474,7 @@@ struct file *shmem_file_setup(const cha
        if (IS_ERR(shm_mnt))
                return (void *)shm_mnt;
  
 -      if (size < 0 || size > SHMEM_MAX_BYTES)
 +      if (size < 0 || size > MAX_LFS_FILESIZE)
                return ERR_PTR(-EINVAL);
  
        if (shmem_acct_size(flags, size))
@@@ -2553,29 -3048,13 +2553,29 @@@ int shmem_zero_setup(struct vm_area_str
   * suit tmpfs, since it may have pages in swapcache, and needs to find those
   * for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
   *
 - * Provide a stub for those callers to start using now, then later
 - * flesh it out to call shmem_getpage() with additional gfp mask, when
 - * shmem_file_splice_read() is added and shmem_readpage() is removed.
 + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in
 + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily.
   */
  struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
                                         pgoff_t index, gfp_t gfp)
  {
 +#ifdef CONFIG_SHMEM
 +      struct inode *inode = mapping->host;
 +      struct page *page;
 +      int error;
 +
 +      BUG_ON(mapping->a_ops != &shmem_aops);
 +      error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL);
 +      if (error)
 +              page = ERR_PTR(error);
 +      else
 +              unlock_page(page);
 +      return page;
 +#else
 +      /*
 +       * The tiny !SHMEM case uses ramfs without swap
 +       */
        return read_cache_page_gfp(mapping, index, gfp);
 +#endif
  }
  EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp);
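
A sketch of a driver-side caller in the style the comment above attributes to i915 (function and variable names are illustrative): relax the mapping's gfp mask so an allocation failure comes back as ERR_PTR(-ENOMEM) instead of triggering the OOM killer.

#include <linux/pagemap.h>
#include <linux/shmem_fs.h>

static struct page *example_get_object_page(struct address_space *mapping,
					    pgoff_t index)
{
	gfp_t gfp = mapping_gfp_mask(mapping);

	gfp |= __GFP_NORETRY | __GFP_NOWARN;
	/* On success the page comes back unlocked with a reference held;
	 * page_cache_release() it when done. */
	return shmem_read_mapping_page_gfp(mapping, index, gfp);
}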
index 26b46ff7466353bcf8fbe58545bd8ab6c2f0e384,25f9fe76289623fd81cf0aba99b7a78729a05a02..42dc27007fdda90801b104207444b8e3e5e41351
@@@ -82,11 -82,11 +82,11 @@@ out
                                  "open_writers");
  }
  
- static void ima_check_last_writer(struct ima_iint_cache *iint,
+ static void ima_check_last_writer(struct integrity_iint_cache *iint,
                                  struct inode *inode,
                                  struct file *file)
  {
 -      mode_t mode = file->f_mode;
 +      fmode_t mode = file->f_mode;
  
        mutex_lock(&iint->mutex);
        if (mode & FMODE_WRITE &&
            atomic_read(&inode->i_writecount) == 1 &&
            iint->version != inode->i_version)
                iint->flags &= ~IMA_MEASURED;
        mutex_unlock(&iint->mutex);
  }

  void ima_file_free(struct file *file)
  {
        struct inode *inode = file->f_dentry->d_inode;
-       struct ima_iint_cache *iint;
+       struct integrity_iint_cache *iint;
  
        if (!iint_initialized || !S_ISREG(inode->i_mode))
                return;
  
-       iint = ima_iint_find(inode);
+       iint = integrity_iint_find(inode);
        if (!iint)
                return;
  
@@@ -121,7 -121,7 +121,7 @@@ static int process_measurement(struct f
                               int mask, int function)
  {
        struct inode *inode = file->f_dentry->d_inode;
-       struct ima_iint_cache *iint;
+       struct integrity_iint_cache *iint;
        int rc = 0;
  
        if (!ima_initialized || !S_ISREG(inode->i_mode))
        if (rc != 0)
                return rc;
  retry:
-       iint = ima_iint_find(inode);
+       iint = integrity_iint_find(inode);
        if (!iint) {
-               rc = ima_inode_alloc(inode);
+               rc = integrity_inode_alloc(inode);
                if (!rc || rc == -EEXIST)
                        goto retry;
                return rc;
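
The retry loop above resolves the race where two tasks both find no iint attached to the inode and both try to allocate one: a return of 0 (we won) or -EEXIST (we lost, but an iint now exists) both mean "look it up again". The same pattern in isolation (my_* names are illustrative):

struct my_obj *my_obj_get(struct inode *inode)
{
	struct my_obj *obj;
	int rc;
retry:
	obj = my_obj_find(inode);		/* locked/rcu lookup */
	if (!obj) {
		rc = my_obj_alloc(inode);	/* may race with another task */
		if (!rc || rc == -EEXIST)
			goto retry;	/* an object exists now either way */
		return ERR_PTR(rc);
	}
	return obj;
}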
diff --combined security/security.c
index 0e4fccfef12cb495fcf059a4dc964e3b45ee136c,19251ccb2de0d09f2ed34a3e6018ffb80eb01cb2..a6328421a0551bef7439247e3cda3588fcd30d20
  #include <linux/init.h>
  #include <linux/kernel.h>
  #include <linux/security.h>
+ #include <linux/integrity.h>
  #include <linux/ima.h>
+ #include <linux/evm.h>
+ #define MAX_LSM_EVM_XATTR     2
  
  /* Boot-time LSM user choice */
  static __initdata char chosen_lsm[SECURITY_NAME_MAX + 1] =
@@@ -334,20 -338,57 +338,57 @@@ int security_inode_alloc(struct inode *
  
  void security_inode_free(struct inode *inode)
  {
-       ima_inode_free(inode);
+       integrity_inode_free(inode);
        security_ops->inode_free_security(inode);
  }
  
  int security_inode_init_security(struct inode *inode, struct inode *dir,
-                                const struct qstr *qstr, char **name,
-                                void **value, size_t *len)
+                                const struct qstr *qstr,
+                                const initxattrs initxattrs, void *fs_data)
+ {
+       struct xattr new_xattrs[MAX_LSM_EVM_XATTR + 1];
+       struct xattr *lsm_xattr, *evm_xattr, *xattr;
+       int ret;
+
+       if (unlikely(IS_PRIVATE(inode)))
+               return -EOPNOTSUPP;
+
+       memset(new_xattrs, 0, sizeof new_xattrs);
+       if (!initxattrs)
+               return security_ops->inode_init_security(inode, dir, qstr,
+                                                        NULL, NULL, NULL);
+       lsm_xattr = new_xattrs;
+       ret = security_ops->inode_init_security(inode, dir, qstr,
+                                               &lsm_xattr->name,
+                                               &lsm_xattr->value,
+                                               &lsm_xattr->value_len);
+       if (ret)
+               goto out;
+       evm_xattr = lsm_xattr + 1;
+       ret = evm_inode_init_security(inode, lsm_xattr, evm_xattr);
+       if (ret)
+               goto out;
+       ret = initxattrs(inode, new_xattrs, fs_data);
+ out:
+       for (xattr = new_xattrs; xattr->name != NULL; xattr++) {
+               kfree(xattr->name);
+               kfree(xattr->value);
+       }
+       return (ret == -EOPNOTSUPP) ? 0 : ret;
+ }
+ EXPORT_SYMBOL(security_inode_init_security);
+
+ int security_old_inode_init_security(struct inode *inode, struct inode *dir,
+                                    const struct qstr *qstr, char **name,
+                                    void **value, size_t *len)
  {
        if (unlikely(IS_PRIVATE(inode)))
                return -EOPNOTSUPP;
        return security_ops->inode_init_security(inode, dir, qstr, name, value,
                                                 len);
  }
- EXPORT_SYMBOL(security_inode_init_security);
+ EXPORT_SYMBOL(security_old_inode_init_security);
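
With the new API, the hook collects the LSM's xattr and EVM's HMAC into one array and hands both to the filesystem in a single callback, so they can be written together (for journalling filesystems, in the same transaction). As with the old interface, xattr->name arrives without the "security." prefix, which the filesystem prepends. A sketch of a filesystem-side callback (hypothetical myfs_* names; the fs_data cookie here is assumed to be a transaction handle):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/xattr.h>

static int myfs_initxattrs(struct inode *inode,
			   const struct xattr *xattr_array, void *fs_data)
{
	const struct xattr *xattr;
	char *name;
	int err = 0;

	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
			       strlen(xattr->name) + 1, GFP_NOFS);
		if (!name) {
			err = -ENOMEM;
			break;
		}
		strcpy(name, XATTR_SECURITY_PREFIX);
		strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
		err = myfs_setxattr(inode, name, xattr->value,
				    xattr->value_len, fs_data);	/* hypothetical setter */
		kfree(name);
		if (err < 0)
			break;
	}
	return err;
}

At inode creation time such a filesystem would then call security_inode_init_security(inode, dir, &dentry->d_name, myfs_initxattrs, trans) rather than the _old_ variant.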
  
  #ifdef CONFIG_SECURITY_PATH
  int security_path_mknod(struct path *dir, struct dentry *dentry, int mode,
@@@ -518,14 -559,26 +559,19 @@@ int security_inode_permission(struct in
  {
        if (unlikely(IS_PRIVATE(inode)))
                return 0;
 -      return security_ops->inode_permission(inode, mask, 0);
 -}
 -
 -int security_inode_exec_permission(struct inode *inode, unsigned int flags)
 -{
 -      if (unlikely(IS_PRIVATE(inode)))
 -              return 0;
 -      return security_ops->inode_permission(inode, MAY_EXEC, flags);
 +      return security_ops->inode_permission(inode, mask);
  }
  
  int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
  {
+       int ret;
+
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_setattr(dentry, attr);
+       ret = security_ops->inode_setattr(dentry, attr);
+       if (ret)
+               return ret;
+       return evm_inode_setattr(dentry, attr);
  }
  EXPORT_SYMBOL_GPL(security_inode_setattr);
  
@@@ -539,9 -592,14 +585,14 @@@ int security_inode_getattr(struct vfsmo
  int security_inode_setxattr(struct dentry *dentry, const char *name,
                            const void *value, size_t size, int flags)
  {
+       int ret;
+
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_setxattr(dentry, name, value, size, flags);
+       ret = security_ops->inode_setxattr(dentry, name, value, size, flags);
+       if (ret)
+               return ret;
+       return evm_inode_setxattr(dentry, name, value, size);
  }
  
  void security_inode_post_setxattr(struct dentry *dentry, const char *name,
                                    const void *value, size_t size, int flags)
  {
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return;
        security_ops->inode_post_setxattr(dentry, name, value, size, flags);
+       evm_inode_post_setxattr(dentry, name, value, size);
  }
  
  int security_inode_getxattr(struct dentry *dentry, const char *name)
@@@ -568,9 -627,14 +620,14 @@@ int security_inode_listxattr(struct den
  
  int security_inode_removexattr(struct dentry *dentry, const char *name)
  {
+       int ret;
+
        if (unlikely(IS_PRIVATE(dentry->d_inode)))
                return 0;
-       return security_ops->inode_removexattr(dentry, name);
+       ret = security_ops->inode_removexattr(dentry, name);
+       if (ret)
+               return ret;
+       return evm_inode_removexattr(dentry, name);
  }
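
setattr, setxattr and removexattr now all follow the same order: the LSM gets the first veto, and only if it permits the change does EVM verify that the inode's 'security.evm' HMAC still matches its protected xattrs. Userspace sees that HMAC as an ordinary xattr; a quick runnable check (path illustrative):

#include <sys/types.h>
#include <sys/xattr.h>
#include <stdio.h>

int main(void)
{
	unsigned char buf[256];
	ssize_t n = getxattr("/etc/passwd", "security.evm",
			     buf, sizeof(buf));

	if (n < 0)
		perror("getxattr");	/* ENODATA if EVM never labeled it */
	else
		printf("security.evm: %zd bytes\n", n);
	return 0;
}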
  
  int security_inode_need_killpriv(struct dentry *dentry)