xfs: verify inode buffers as they are read from disk

[firefly-linux-kernel-4.4.55.git] / fs / xfs / xfs_inode.c
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c

index 2778258fcfa239e07dbc79edf6cb4750b7e13dd8..514eac913f1cc0c3b9759e488f80759a06a7a416 100644 (file)
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
  #include "xfs_filestream.h"
  #include "xfs_vnodeops.h"
  #include "xfs_trace.h"
+#include "xfs_icache.h"
  
  kmem_zone_t *xfs_ifork_zone;
  kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
         return 0;
  }
  
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code.  It is used in places that wish to lock the inode solely
+ * for reading the extents.  The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in.  Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though.  What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared().  This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+       xfs_inode_t     *ip)
+{
+       uint    lock_mode;
+
+       if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+           ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+               lock_mode = XFS_ILOCK_EXCL;
+       } else {
+               lock_mode = XFS_ILOCK_SHARED;
+       }
+
+       xfs_ilock(ip, lock_mode);
+
+       return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+       xfs_inode_t     *ip,
+       unsigned int    lock_mode)
+{
+       xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.  This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  It can be:
+ *             XFS_IOLOCK_SHARED,
+ *             XFS_IOLOCK_EXCL,
+ *             XFS_ILOCK_SHARED,
+ *             XFS_ILOCK_EXCL,
+ *             XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *             XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *             XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *             XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+       else if (lock_flags & XFS_ILOCK_SHARED)
+               mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  See the comment for xfs_ilock() for a list
+ *      of valid values.
+ */
+int
+xfs_ilock_nowait(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL) {
+               if (!mrtryupdate(&ip->i_iolock))
+                       goto out;
+       } else if (lock_flags & XFS_IOLOCK_SHARED) {
+               if (!mrtryaccess(&ip->i_iolock))
+                       goto out;
+       }
+       if (lock_flags & XFS_ILOCK_EXCL) {
+               if (!mrtryupdate(&ip->i_lock))
+                       goto out_undo_iolock;
+       } else if (lock_flags & XFS_ILOCK_SHARED) {
+               if (!mrtryaccess(&ip->i_lock))
+                       goto out_undo_iolock;
+       }
+       return 1;
+
+ out_undo_iolock:
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrunlock_excl(&ip->i_iolock);
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mrunlock_shared(&ip->i_iolock);
+ out:
+       return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be unlocked.  See the comment for xfs_ilock() for a list
+ *      of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT(lock_flags != 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrunlock_excl(&ip->i_iolock);
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mrunlock_shared(&ip->i_iolock);
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrunlock_excl(&ip->i_lock);
+       else if (lock_flags & XFS_ILOCK_SHARED)
+               mrunlock_shared(&ip->i_lock);
+
+       trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * Give up write locks.  The i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrdemote(&ip->i_lock);
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrdemote(&ip->i_iolock);
+
+       trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+               if (!(lock_flags & XFS_ILOCK_SHARED))
+                       return !!ip->i_lock.mr_writer;
+               return rwsem_is_locked(&ip->i_lock.mr_lock);
+       }
+
+       if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+               if (!(lock_flags & XFS_IOLOCK_SHARED))
+                       return !!ip->i_iolock.mr_writer;
+               return rwsem_is_locked(&ip->i_iolock.mr_lock);
+       }
+
+       ASSERT(0);
+       return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
+}
+
  #ifdef DEBUG
  /*
   * Make sure that the extents in the given memory buffer
@@ -131,6 +382,46 @@ xfs_inobp_check(
  }
  #endif
  
+static void
+xfs_inode_buf_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       int             i;
+       int             ni;
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        */
+       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+                                            mp, dip);
+#ifdef DEBUG
+                       xfs_emerg(mp,
+                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                               (unsigned long long)bp->b_bn, i,
+                               be16_to_cpu(dip->di_magic));
+                       ASSERT(0);
+#endif
+               }
+       }
+       xfs_inobp_check(mp, bp);
+       bp->b_iodone = NULL;
+       xfs_buf_ioend(bp, 0);
+}
+
  /*
   * This routine is called to map an inode to the buffer containing the on-disk
   * version of the inode.  It returns a pointer to the buffer containing the
@@ -145,71 +436,33 @@ xfs_imap_to_bp(
         struct xfs_mount        *mp,
         struct xfs_trans        *tp,
         struct xfs_imap         *imap,
-       struct xfs_dinode       **dipp,
+       struct xfs_dinode       **dipp,
         struct xfs_buf          **bpp,
         uint                    buf_flags,
         uint                    iget_flags)
  {
         struct xfs_buf          *bp;
         int                     error;
-       int                     i;
-       int                     ni;
  
         buf_flags |= XBF_UNMAPPED;
         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, buf_flags, &bp);
+                                  (int)imap->im_len, buf_flags, &bp,
+                                  xfs_inode_buf_verify);
         if (error) {
-               if (error != EAGAIN) {
-                       xfs_warn(mp,
-                               "%s: xfs_trans_read_buf() returned error %d.",
-                               __func__, error);
-               } else {
+               if (error == EAGAIN) {
                         ASSERT(buf_flags & XBF_TRYLOCK);
+                       return error;
                 }
-               return error;
-       }
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-        */
-#ifdef DEBUG
-       ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
-#else  /* usual case */
-       ni = 1;
-#endif
  
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
+               if (error == EFSCORRUPTED &&
+                   (iget_flags & XFS_IGET_UNTRUSTED))
+                       return XFS_ERROR(EINVAL);
  
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       if (iget_flags & XFS_IGET_UNTRUSTED) {
-                               xfs_trans_brelse(tp, bp);
-                               return XFS_ERROR(EINVAL);
-                       }
-                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-                                            mp, dip);
-#ifdef DEBUG
-                       xfs_emerg(mp,
-                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
-                               (unsigned long long)imap->im_blkno, i,
-                               be16_to_cpu(dip->di_magic));
-                       ASSERT(0);
-#endif
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
+               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                       __func__, error);
+               return error;
         }
  
-       xfs_inobp_check(mp, bp);
-
         *bpp = bp;
         *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
         return 0;
@@ -853,16 +1106,16 @@ xfs_iread_extents(
   * set according to the contents of the given cred structure.
   *
   * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode.  Finally,
- * fill in the inode and log its initial contents.  In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode.  Finally, fill in the inode and
+ * log its initial contents.  In this case, ialloc_context would be
+ * set to NULL.
   *
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
   * The caller should then commit the current transaction, start a new
   * transaction, and call xfs_ialloc() again to actually get the inode.
   *
@@ -1509,7 +1762,8 @@ xfs_ifree_cluster(
                  * to mark all the active inodes on the buffer stale.
                  */
                 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
-                                       mp->m_bsize * blks_per_cluster, 0);
+                                       mp->m_bsize * blks_per_cluster,
+                                       XBF_UNMAPPED);
  
                 if (!bp)
                         return ENOMEM;
@@ -3660,3 +3914,40 @@ xfs_iext_irec_update_extoffs(
                 ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
         }
  }
+
+/*
+ * Test whether it is appropriate to check an inode for, and free, post-EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+       /* prealloc/delalloc exists only on regular files */
+       if (!S_ISREG(ip->i_d.di_mode))
+               return false;
+
+       /*
+        * Zero sized files with no cached pages and no delalloc blocks will
+        * not have speculative prealloc/delalloc blocks to remove.
+        */
+       if (VFS_I(ip)->i_size == 0 &&
+           VN_CACHED(VFS_I(ip)) == 0 &&
+           ip->i_delayed_blks == 0)
+               return false;
+
+       /* If we haven't read in the extent list, then don't do it now. */
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+               return false;
+
+       /*
+        * Do not free real preallocated or append-only files unless the file
+        * has delalloc blocks and we are forced to remove them.
+        */
+       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+               if (!force || ip->i_delayed_blks == 0)
+                       return false;
+
+       return true;
+}
+