Merge tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs
author Linus Torvalds <torvalds@linux-foundation.org>
Wed, 12 Dec 2012 17:19:45 +0000 (09:19 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 12 Dec 2012 17:19:45 +0000 (09:19 -0800)
Pull xfs update from Ben Myers:
 "There is plenty going on, including the cleanup of xfssyncd, metadata
  verifiers, CRC infrastructure for the log, tracking of inodes with
  speculative allocation, a cleanup of xfs_fs_subr.c, fixes for
  XFS_IOC_ZERO_RANGE, an important fix related to log replay (only
  update the last_sync_lsn when a transaction completes), a fix for
  deadlock on AGF buffers, documentation and comment updates, and a few
  more cleanups and fixes.

  Details:
   - remove the xfssyncd mess
   - only update the last_sync_lsn when a transaction completes
   - zero allocation_args on the kernel stack
   - fix AGF/alloc workqueue deadlock
   - silence uninitialised f.file warning
   - Update inode alloc comments
   - Update mount options documentation
   - report projid32bit feature in geometry call
   - speculative preallocation inode tracking
   - fix attr tree double split corruption
   - fix broken error handling in xfs_vm_writepage
   - drop buffer io reference when a bad bio is built
   - add more attribute tree trace points
   - growfs infrastructure changes for 3.8
   - fs/xfs/xfs_fs_subr.c die die die
   - add CRC infrastructure
   - add CRC checks to the log
   - Remove description of nodelaylog mount option from xfs.txt
   - inode allocation should use unmapped buffers
   - byte range granularity for XFS_IOC_ZERO_RANGE
   - fix direct IO nested transaction deadlock
   - fix stray dquot unlock when reclaiming dquots
   - fix sparse reported log CRC endian issue"

Fix up trivial conflict in fs/xfs/xfs_fsops.c due to the same patch
having been applied twice (commits eaef854335ce and 1375cb65e87b: "xfs:
growfs: don't read garbage for new secondary superblocks") with later
updates to the affected code in the XFS tree.
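
The recurring pattern throughout this series is the new buffer verifier
mechanism: each metadata type gets a const struct xfs_buf_ops whose read
and write callbacks share one sanity-check routine. A minimal sketch of
the shape as it recurs in the hunks below; the xfs_foo names and
XFS_FOO_MAGIC are placeholders, not real symbols:

	static void
	xfs_foo_verify(
		struct xfs_buf	*bp)
	{
		struct xfs_foo	*foo = bp->b_addr;

		/* type-specific sanity checks: magic, bounds, counters */
		if (foo->magic != cpu_to_be32(XFS_FOO_MAGIC))
			xfs_buf_ioerror(bp, EFSCORRUPTED);
	}

	static void
	xfs_foo_read_verify(
		struct xfs_buf	*bp)
	{
		xfs_foo_verify(bp);
	}

	static void
	xfs_foo_write_verify(
		struct xfs_buf	*bp)
	{
		xfs_foo_verify(bp);
	}

	const struct xfs_buf_ops xfs_foo_buf_ops = {
		.verify_read = xfs_foo_read_verify,
		.verify_write = xfs_foo_write_verify,
	};

Call sites then pass &xfs_foo_buf_ops as the new final argument of
xfs_trans_read_buf(), while paths that initialise buffers from scratch
assign bp->b_ops directly, as the hunks below show.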

* tag 'for-linus-v3.8-rc1' of git://oss.sgi.com/xfs/xfs: (78 commits)
  xfs: fix sparse reported log CRC endian issue
  xfs: fix stray dquot unlock when reclaiming dquots
  xfs: fix direct IO nested transaction deadlock.
  xfs: byte range granularity for XFS_IOC_ZERO_RANGE
  xfs: inode allocation should use unmapped buffers.
  xfs: Remove the description of nodelaylog mount option from xfs.txt
  xfs: add CRC checks to the log
  xfs: add CRC infrastructure
  xfs: convert buffer verifiers to an ops structure.
  xfs: connect up write verifiers to new buffers
  xfs: add pre-write metadata buffer verifier callbacks
  xfs: add buffer pre-write callback
  xfs: Add verifiers to dir2 data readahead.
  xfs: add xfs_da_node verification
  xfs: factor and verify attr leaf reads
  xfs: factor dir2 leaf read
  xfs: factor out dir2 data block reading
  xfs: factor dir2 free block reading
  xfs: verify dir2 block format buffers
  xfs: factor dir2 block read operations
  ...

72 files changed:
Documentation/filesystems/xfs.txt
fs/xfs/Kconfig
fs/xfs/Makefile
fs/xfs/uuid.h
fs/xfs/xfs_ag.h
fs/xfs/xfs_alloc.c
fs/xfs/xfs_alloc.h
fs/xfs/xfs_alloc_btree.c
fs/xfs/xfs_alloc_btree.h
fs/xfs/xfs_aops.c
fs/xfs/xfs_attr.c
fs/xfs/xfs_attr_leaf.c
fs/xfs/xfs_attr_leaf.h
fs/xfs/xfs_bmap.c
fs/xfs/xfs_bmap_btree.c
fs/xfs/xfs_bmap_btree.h
fs/xfs/xfs_btree.c
fs/xfs/xfs_btree.h
fs/xfs/xfs_buf.c
fs/xfs/xfs_buf.h
fs/xfs/xfs_cksum.h [new file with mode: 0644]
fs/xfs/xfs_da_btree.c
fs/xfs/xfs_da_btree.h
fs/xfs/xfs_dfrag.c
fs/xfs/xfs_dir2_block.c
fs/xfs/xfs_dir2_data.c
fs/xfs/xfs_dir2_leaf.c
fs/xfs/xfs_dir2_node.c
fs/xfs/xfs_dir2_priv.h
fs/xfs/xfs_dquot.c
fs/xfs/xfs_dquot.h
fs/xfs/xfs_export.c
fs/xfs/xfs_file.c
fs/xfs/xfs_fs.h
fs/xfs/xfs_fs_subr.c [deleted file]
fs/xfs/xfs_fsops.c
fs/xfs/xfs_globals.c
fs/xfs/xfs_ialloc.c
fs/xfs/xfs_ialloc.h
fs/xfs/xfs_ialloc_btree.c
fs/xfs/xfs_ialloc_btree.h
fs/xfs/xfs_icache.c [new file with mode: 0644]
fs/xfs/xfs_icache.h [new file with mode: 0644]
fs/xfs/xfs_iget.c [deleted file]
fs/xfs/xfs_inode.c
fs/xfs/xfs_inode.h
fs/xfs/xfs_ioctl.c
fs/xfs/xfs_iomap.c
fs/xfs/xfs_iops.c
fs/xfs/xfs_itable.c
fs/xfs/xfs_linux.h
fs/xfs/xfs_log.c
fs/xfs/xfs_log.h
fs/xfs/xfs_log_priv.h
fs/xfs/xfs_log_recover.c
fs/xfs/xfs_mount.c
fs/xfs/xfs_mount.h
fs/xfs/xfs_qm.c
fs/xfs/xfs_qm_syscalls.c
fs/xfs/xfs_rtalloc.c
fs/xfs/xfs_sb.h
fs/xfs/xfs_super.c
fs/xfs/xfs_super.h
fs/xfs/xfs_sync.c [deleted file]
fs/xfs/xfs_sync.h [deleted file]
fs/xfs/xfs_sysctl.c
fs/xfs/xfs_sysctl.h
fs/xfs/xfs_trace.h
fs/xfs/xfs_trans.h
fs/xfs/xfs_trans_buf.c
fs/xfs/xfs_vnodeops.c
fs/xfs/xfs_vnodeops.h

index 3fc0c31a6f5dc5f8ee1220d9bcf8062038297aa5..3e4b3dd1e046a76b02c45f91c8779ae82e402d1f 100644 (file)
@@ -43,7 +43,7 @@ When mounting an XFS filesystem, the following options are accepted.
        Issue command to let the block device reclaim space freed by the
        filesystem.  This is useful for SSD devices, thinly provisioned
        LUNs and virtual machine images, but may have a performance
-       impact.  This option is incompatible with the nodelaylog option.
+       impact.
 
   dmapi
        Enable the DMAPI (Data Management API) event callouts.
@@ -72,8 +72,15 @@ When mounting an XFS filesystem, the following options are accepted.
        Indicates that XFS is allowed to create inodes at any location
        in the filesystem, including those which will result in inode
        numbers occupying more than 32 bits of significance.  This is
-       provided for backwards compatibility, but causes problems for
-       backup applications that cannot handle large inode numbers.
+       the default allocation option. Applications which do not handle
+       inode numbers bigger than 32 bits should use the inode32 option.
+
+  inode32
+       Indicates that XFS is limited to creating inodes at locations which
+       will not result in inode numbers with more than 32 bits of
+       significance. This is provided for backwards compatibility, since
+       64-bit inode numbers might cause problems for some applications
+       that cannot handle large inode numbers.
 
   largeio/nolargeio
        If "nolargeio" is specified, the optimal I/O reported in
index 6100ec0fa1d453770f41db47e0402daea8271c64..5a7ffe54f5d593591efc37957b7ae1e6cac74235 100644 (file)
@@ -2,6 +2,7 @@ config XFS_FS
        tristate "XFS filesystem support"
        depends on BLOCK
        select EXPORTFS
+       select LIBCRC32C
        help
          XFS is a high performance journaling filesystem which originated
          on the SGI IRIX platform.  It is completely multi-threaded, can
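
The new LIBCRC32C select backs the CRC infrastructure this series adds
(see fs/xfs/xfs_cksum.h in the file list and the "add CRC checks to the
log" commit). A minimal sketch, assuming only the kernel's crc32c() API,
of checksumming a block while feeding zeroes in place of its embedded
CRC field; the helper name and seed choice are illustrative, not the
actual xfs_cksum.h contents:

	#include <linux/crc32c.h>

	static u32
	example_block_cksum(char *buffer, size_t length, unsigned long crc_offset)
	{
		u32 zero = 0;
		u32 crc;

		/* checksum everything up to the stored CRC field */
		crc = crc32c(~0U, buffer, crc_offset);

		/* substitute zero for the field itself... */
		crc = crc32c(crc, &zero, sizeof(zero));

		/* ...then checksum the remainder of the block */
		return crc32c(crc, buffer + crc_offset + sizeof(zero),
			      length - (crc_offset + sizeof(zero)));
	}
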
index d2bf974b1a2f68db6a47901cebd060bb49518bd1..d02201df855b3cb99f4a97fad8491fb6b5c8d658 100644 (file)
@@ -37,9 +37,8 @@ xfs-y                         += xfs_aops.o \
                                   xfs_file.o \
                                   xfs_filestream.o \
                                   xfs_fsops.o \
-                                  xfs_fs_subr.o \
                                   xfs_globals.o \
-                                  xfs_iget.o \
+                                  xfs_icache.o \
                                   xfs_ioctl.o \
                                   xfs_iomap.o \
                                   xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y                         += xfs_aops.o \
                                   xfs_message.o \
                                   xfs_mru_cache.o \
                                   xfs_super.o \
-                                  xfs_sync.o \
                                   xfs_xattr.o \
                                   xfs_rename.o \
                                   xfs_utils.o \
index 4732d71262cc3f6f3aca0f143a96f81094bf5a22..104db0f3bed6729ccfdcaca85899783375c9bb14 100644 (file)
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
 extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
 extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
 
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+       memcpy(dst, src, sizeof(uuid_t));
+}
+
 #endif /* __XFS_SUPPORT_UUID_H__ */
index 44d65c1533c045d219af0af4d6d64756a9a448c9..f2aeedb6a579f0e45246feade8ca68dd4a8652ff 100644 (file)
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
 extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
                        xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
 /*
  * Size of the unlinked inode hash table in the agi.
  */
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
 extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
                                xfs_agnumber_t agno, struct xfs_buf **bpp);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 /*
  * The third a.g. block contains the a.g. freelist, an array
  * of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
 #define XFS_ICI_NO_TAG         (-1)    /* special flag for an untagged lookup
                                           in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG    0       /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG  1       /* inode has blocks beyond EOF */
 
 #define        XFS_AG_MAXLEVELS(mp)            ((mp)->m_ag_maxlevels)
 #define        XFS_MIN_FREELIST_RAW(bl,cl,mp)  \
index 335206a9c6985fde106d164543efee515bba1433..393055fe3aef6c199537eb23db1a02de8f2e9f43 100644 (file)
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
        return 0;
 }
 
+static void
+xfs_agfl_verify(
+       struct xfs_buf  *bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+       /*
+        * we cannot actually do any verification of the AGFL because mkfs does
+        * not initialise the AGFL to zero or NULL. Hence the only valid part of
+        * the AGFL is what the AGF says is active. We can't get to the AGF, so
+        * we can't verify just those entries are valid.
+        *
+        * This problem goes away when the CRC format change comes along as that
+        * requires the AGFL to be initialised by mkfs. At that point, we can
+        * verify the blocks in the agfl -active or not- lie within the bounds
+        * of the AG. Until then, just leave this check ifdef'd out.
+        */
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+       int             agfl_ok = 1;
+
+       int             i;
+
+       for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+               if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+                   be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+                       agfl_ok = 0;
+       }
+
+       if (!agfl_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+#endif
+}
+
+static void
+xfs_agfl_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agfl_verify(bp);
+}
+
+static void
+xfs_agfl_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agfl_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+       .verify_read = xfs_agfl_read_verify,
+       .verify_write = xfs_agfl_write_verify,
+};
+
 /*
  * Read in the allocation group free block array.
  */
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
        error = xfs_trans_read_buf(
                        mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), 0, &bp);
+                       XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
        if (error)
                return error;
        ASSERT(!xfs_buf_geterror(bp));
@@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist(
        return 0;
 }
 
+static void
+xfs_agf_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_agf  *agf;
+       int             agf_ok;
+
+       agf = XFS_BUF_TO_AGF(bp);
+
+       agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+               XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+               be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+               be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+               be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+               be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+       /*
+        * during growfs operations, the perag is not fully initialised,
+        * so we can't use it for any useful checking. growfs uses uncached
+        * buffers that don't have the perag attached, so we can detect that
+        * case (b_pag is NULL) and skip the perag-dependent checks.
+        */
+       if (bp->b_pag)
+               agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+                                               bp->b_pag->pag_agno;
+
+       if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+               agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+                                               be32_to_cpu(agf->agf_length);
+
+       if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+                       XFS_RANDOM_ALLOC_READ_AGF))) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_agf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agf_verify(bp);
+}
+
+static void
+xfs_agf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+       .verify_read = xfs_agf_read_verify,
+       .verify_write = xfs_agf_write_verify,
+};
+
 /*
  * Read in the allocation group header (free/alloc section).
  */
@@ -2102,44 +2213,19 @@ xfs_read_agf(
        int                     flags,  /* XFS_BUF_ */
        struct xfs_buf          **bpp)  /* buffer for the ag freelist header */
 {
-       struct xfs_agf  *agf;           /* ag freelist header */
-       int             agf_ok;         /* set if agf is consistent */
        int             error;
 
        ASSERT(agno != NULLAGNUMBER);
        error = xfs_trans_read_buf(
                        mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), flags, bpp);
+                       XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
        if (error)
                return error;
        if (!*bpp)
                return 0;
 
        ASSERT(!(*bpp)->b_error);
-       agf = XFS_BUF_TO_AGF(*bpp);
-
-       /*
-        * Validate the magic number of the agf block.
-        */
-       agf_ok =
-               agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
-               XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
-               be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
-               be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
-               be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
-               be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
-               be32_to_cpu(agf->agf_seqno) == agno;
-       if (xfs_sb_version_haslazysbcount(&mp->m_sb))
-               agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
-                                               be32_to_cpu(agf->agf_length);
-       if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
-                       XFS_RANDOM_ALLOC_READ_AGF))) {
-               XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
-                                    XFS_ERRLEVEL_LOW, mp, agf);
-               xfs_trans_brelse(tp, *bpp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
        xfs_buf_set_ref(*bpp, XFS_AGF_REF);
        return 0;
 }
index feacb061bab78bb3492ee0ff37eed5cc0fb95894..99d0a61015587f7b8400bb8519bec6717b91c1cf 100644 (file)
@@ -231,4 +231,7 @@ xfs_alloc_get_rec(
        xfs_extlen_t            *len,   /* output: length of extent */
        int                     *stat); /* output: success/failure */
 
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+
 #endif /* __XFS_ALLOC_H__ */
index f7876c6d616553d222295624a20f4c168c4c021d..b1ddef6b26892f4f427190833c1265d97be4f57b 100644 (file)
@@ -272,6 +272,82 @@ xfs_allocbt_key_diff(
        return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
 }
 
+static void
+xfs_allocbt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       struct xfs_perag        *pag = bp->b_pag;
+       unsigned int            level;
+       int                     sblock_ok; /* block passes checks */
+
+       /*
+        * magic number and level verification
+        *
+        * During growfs operations, we can't verify the exact level as the
+        * perag is not fully initialised and hence not attached to the buffer.
+        * In this case, check against the maximum tree depth.
+        */
+       level = be16_to_cpu(block->bb_level);
+       switch (block->bb_magic) {
+       case cpu_to_be32(XFS_ABTB_MAGIC):
+               if (pag)
+                       sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+               else
+                       sblock_ok = level < mp->m_ag_maxlevels;
+               break;
+       case cpu_to_be32(XFS_ABTC_MAGIC):
+               if (pag)
+                       sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+               else
+                       sblock_ok = level < mp->m_ag_maxlevels;
+               break;
+       default:
+               sblock_ok = 0;
+               break;
+       }
+
+       /* numrecs verification */
+       sblock_ok = sblock_ok &&
+               be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+       /* sibling pointer verification */
+       sblock_ok = sblock_ok &&
+               (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+               block->bb_u.s.bb_leftsib &&
+               (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+               block->bb_u.s.bb_rightsib;
+
+       if (!sblock_ok) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_allocbt_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_allocbt_verify(bp);
+}
+
+static void
+xfs_allocbt_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_allocbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+       .verify_read = xfs_allocbt_read_verify,
+       .verify_write = xfs_allocbt_write_verify,
+};
+
+
 #ifdef DEBUG
 STATIC int
 xfs_allocbt_keys_inorder(
@@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
        .init_rec_from_cur      = xfs_allocbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_allocbt_init_ptr_from_cur,
        .key_diff               = xfs_allocbt_key_diff,
+       .buf_ops                = &xfs_allocbt_buf_ops,
 #ifdef DEBUG
        .keys_inorder           = xfs_allocbt_keys_inorder,
        .recs_inorder           = xfs_allocbt_recs_inorder,
index 359fb86ed8769b6ba5259a5be6959fd40c977b46..7e89a2b429ddcb350a847ede80f7c622000d71c8 100644 (file)
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
                xfs_agnumber_t, xfs_btnum_t);
 extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
 #endif /* __XFS_ALLOC_BTREE_H__ */
index e57e2daa357c34fc9634871a1d41dd76569cf871..4111a40ebe1a17dde31f89e5154c0d257c6e5d6c 100644 (file)
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
        ioend->io_append_trans = tp;
 
        /*
-        * We will pass freeze protection with a transaction.  So tell lockdep
+        * We may pass freeze protection with a transaction.  So tell lockdep
         * we released it.
         */
        rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
        xfs_fsize_t             isize;
 
        /*
-        * The transaction was allocated in the I/O submission thread,
-        * thus we need to mark ourselves as beeing in a transaction
-        * manually.
+        * The transaction may have been allocated in the I/O submission thread,
+        * thus we need to mark ourselves as being in a transaction manually.
+        * Similarly for freeze protection.
         */
        current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+       rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+                          0, 1, _THIS_IP_);
 
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
 
                if (ioend->io_type == XFS_IO_UNWRITTEN)
                        queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-               else if (ioend->io_append_trans)
+               else if (ioend->io_append_trans ||
+                        (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
                        queue_work(mp->m_data_workqueue, &ioend->io_work);
                else
                        xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
        struct xfs_inode *ip = XFS_I(ioend->io_inode);
        int             error = 0;
 
-       if (ioend->io_append_trans) {
-               /*
-                * We've got freeze protection passed with the transaction.
-                * Tell lockdep about it.
-                */
-               rwsem_acquire_read(
-                       &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                       0, 1, _THIS_IP_);
-       }
        if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
                ioend->io_error = -EIO;
                goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
         * range to normal written extens after the data I/O has finished.
         */
        if (ioend->io_type == XFS_IO_UNWRITTEN) {
+               error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+                                                 ioend->io_size);
+       } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
                /*
-                * For buffered I/O we never preallocate a transaction when
-                * doing the unwritten extent conversion, but for direct I/O
-                * we do not know if we are converting an unwritten extent
-                * or not at the point where we preallocate the transaction.
+                * For direct I/O we do not know if we need to allocate blocks
+                * or not so we can't preallocate an append transaction as that
+                * results in nested reservations and log space deadlocks. Hence
+                * allocate the transaction here. While this is sub-optimal and
+                * can block IO completion for some time, we're stuck with doing
+                * it this way until we can pass the ioend to the direct IO
+                * allocation callbacks and avoid nesting that way.
                 */
-               if (ioend->io_append_trans) {
-                       ASSERT(ioend->io_isdirect);
-
-                       current_set_flags_nested(
-                               &ioend->io_append_trans->t_pflags, PF_FSTRANS);
-                       xfs_trans_cancel(ioend->io_append_trans, 0);
-               }
-
-               error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-                                                ioend->io_size);
-               if (error) {
-                       ioend->io_error = -error;
+               error = xfs_setfilesize_trans_alloc(ioend);
+               if (error)
                        goto done;
-               }
+               error = xfs_setfilesize(ioend);
        } else if (ioend->io_append_trans) {
                error = xfs_setfilesize(ioend);
-               if (error)
-                       ioend->io_error = -error;
        } else {
                ASSERT(!xfs_ioend_is_append(ioend));
        }
 
 done:
+       if (error)
+               ioend->io_error = -error;
        xfs_destroy_ioend(ioend);
 }
 
@@ -1432,25 +1422,21 @@ xfs_vm_direct_IO(
                size_t size = iov_length(iov, nr_segs);
 
                /*
-                * We need to preallocate a transaction for a size update
-                * here.  In the case that this write both updates the size
-                * and converts at least on unwritten extent we will cancel
-                * the still clean transaction after the I/O has finished.
+                * We cannot preallocate a size update transaction here as we
+                * don't know whether allocation is necessary or not. Hence we
+                * can only tell IO completion that one is necessary if we are
+                * not doing unwritten extent conversion.
                 */
                iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-               if (offset + size > XFS_I(inode)->i_d.di_size) {
-                       ret = xfs_setfilesize_trans_alloc(ioend);
-                       if (ret)
-                               goto out_destroy_ioend;
+               if (offset + size > XFS_I(inode)->i_d.di_size)
                        ioend->io_isdirect = 1;
-               }
 
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
                                            xfs_get_blocks_direct,
                                            xfs_end_io_direct_write, NULL, 0);
                if (ret != -EIOCBQUEUED && iocb->private)
-                       goto out_trans_cancel;
+                       goto out_destroy_ioend;
        } else {
                ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
                                            offset, nr_segs,
@@ -1460,15 +1446,6 @@ xfs_vm_direct_IO(
 
        return ret;
 
-out_trans_cancel:
-       if (ioend->io_append_trans) {
-               current_set_flags_nested(&ioend->io_append_trans->t_pflags,
-                                        PF_FSTRANS);
-               rwsem_acquire_read(
-                       &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-                       0, 1, _THIS_IP_);
-               xfs_trans_cancel(ioend->io_append_trans, 0);
-       }
 out_destroy_ioend:
        xfs_destroy_ioend(ioend);
        return ret;
@@ -1641,7 +1618,7 @@ xfs_vm_bmap(
 
        trace_xfs_vm_bmap(XFS_I(inode));
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+       filemap_write_and_wait(mapping);
        xfs_iunlock(ip, XFS_IOLOCK_SHARED);
        return generic_block_bmap(mapping, block, xfs_get_blocks);
 }
index 0ca1f0be62d262f8eb23873f48c81ab864995fe8..aaf472532b3c37beb11c33ac56725ce6eda8c0ea 100644 (file)
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
         */
        dp = args->dp;
        args->blkno = 0;
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-                                            XFS_ATTR_FORK);
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
        if (error)
-               return(error);
-       ASSERT(bp != NULL);
+               return error;
 
        /*
         * Look up the given attribute in the leaf block.  Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
                 * Read in the block containing the "old" attr, then
                 * remove the "old" attr from that block (neat, huh!)
                 */
-               error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
-                                                    &bp, XFS_ATTR_FORK);
+               error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+                                          -1, &bp);
                if (error)
-                       return(error);
-               ASSERT(bp != NULL);
-               (void)xfs_attr_leaf_remove(bp, args);
+                       return error;
+
+               xfs_attr_leaf_remove(bp, args);
 
                /*
                 * If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
         */
        dp = args->dp;
        args->blkno = 0;
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-                                            XFS_ATTR_FORK);
-       if (error) {
-               return(error);
-       }
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
+               return error;
 
-       ASSERT(bp != NULL);
        error = xfs_attr_leaf_lookup_int(bp, args);
        if (error == ENOATTR) {
                xfs_trans_brelse(args->trans, bp);
                return(error);
        }
 
-       (void)xfs_attr_leaf_remove(bp, args);
+       xfs_attr_leaf_remove(bp, args);
 
        /*
         * If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
        struct xfs_buf *bp;
        int error;
 
+       trace_xfs_attr_leaf_get(args);
+
        args->blkno = 0;
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-                                            XFS_ATTR_FORK);
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
        if (error)
-               return(error);
-       ASSERT(bp != NULL);
+               return error;
 
        error = xfs_attr_leaf_lookup_int(bp, args);
        if (error != EEXIST)  {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
 STATIC int
 xfs_attr_leaf_list(xfs_attr_list_context_t *context)
 {
-       xfs_attr_leafblock_t *leaf;
        int error;
        struct xfs_buf *bp;
 
+       trace_xfs_attr_leaf_list(context);
+
        context->cursor->blkno = 0;
-       error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+       error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
        if (error)
                return XFS_ERROR(error);
-       ASSERT(bp != NULL);
-       leaf = bp->b_addr;
-       if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-               XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
-                                    context->dp->i_mount, leaf);
-               xfs_trans_brelse(NULL, bp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
 
        error = xfs_attr_leaf_list_int(bp, context);
        xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
                ASSERT(state->path.blk[0].bp);
                state->path.blk[0].bp = NULL;
 
-               error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
-                                                    XFS_ATTR_FORK);
+               error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
                if (error)
                        goto out;
-               ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
-                      cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
 
                if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
                        xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
        xfs_da_state_blk_t *blk;
        int level;
 
+       trace_xfs_attr_fillstate(state->args);
+
        /*
         * Roll down the "path" in the state structure, storing the on-disk
         * block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
        xfs_da_state_blk_t *blk;
        int level, error;
 
+       trace_xfs_attr_refillstate(state->args);
+
        /*
         * Roll down the "path" in the state structure, storing the on-disk
         * block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
        ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
        for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
                if (blk->disk_blkno) {
-                       error = xfs_da_read_buf(state->args->trans,
+                       error = xfs_da_node_read(state->args->trans,
                                                state->args->dp,
                                                blk->blkno, blk->disk_blkno,
                                                &blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
        ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
        for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
                if (blk->disk_blkno) {
-                       error = xfs_da_read_buf(state->args->trans,
+                       error = xfs_da_node_read(state->args->trans,
                                                state->args->dp,
                                                blk->blkno, blk->disk_blkno,
                                                &blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
        int error, retval;
        int i;
 
+       trace_xfs_attr_node_get(args);
+
        state = xfs_da_state_alloc();
        state->args = args;
        state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
        int error, i;
        struct xfs_buf *bp;
 
+       trace_xfs_attr_node_list(context);
+
        cursor = context->cursor;
        cursor->initted = 1;
 
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
         */
        bp = NULL;
        if (cursor->blkno > 0) {
-               error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+               error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
                                              &bp, XFS_ATTR_FORK);
                if ((error != 0) && (error != EFSCORRUPTED))
                        return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
        if (bp == NULL) {
                cursor->blkno = 0;
                for (;;) {
-                       error = xfs_da_read_buf(NULL, context->dp,
+                       error = xfs_da_node_read(NULL, context->dp,
                                                      cursor->blkno, -1, &bp,
                                                      XFS_ATTR_FORK);
                        if (error)
                                return(error);
-                       if (unlikely(bp == NULL)) {
-                               XFS_ERROR_REPORT("xfs_attr_node_list(2)",
-                                                XFS_ERRLEVEL_LOW,
-                                                context->dp->i_mount);
-                               return(XFS_ERROR(EFSCORRUPTED));
-                       }
                        node = bp->b_addr;
                        if (node->hdr.info.magic ==
                            cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
         */
        for (;;) {
                leaf = bp->b_addr;
-               if (unlikely(leaf->hdr.info.magic !=
-                            cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
-                       XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
-                                            XFS_ERRLEVEL_LOW,
-                                            context->dp->i_mount, leaf);
-                       xfs_trans_brelse(NULL, bp);
-                       return(XFS_ERROR(EFSCORRUPTED));
-               }
                error = xfs_attr_leaf_list_int(bp, context);
                if (error) {
                        xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
                        break;
                cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
                xfs_trans_brelse(NULL, bp);
-               error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
-                                             &bp, XFS_ATTR_FORK);
+               error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+                                          &bp);
                if (error)
-                       return(error);
-               if (unlikely((bp == NULL))) {
-                       XFS_ERROR_REPORT("xfs_attr_node_list(5)",
-                                        XFS_ERRLEVEL_LOW,
-                                        context->dp->i_mount);
-                       return(XFS_ERROR(EFSCORRUPTED));
-               }
+                       return error;
        }
        xfs_trans_brelse(NULL, bp);
        return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
        int nmap, error, tmp, valuelen, blkcnt, i;
        xfs_dablk_t lblkno;
 
+       trace_xfs_attr_rmtval_get(args);
+
        ASSERT(!(args->flags & ATTR_KERNOVAL));
 
        mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
                        dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
                        blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
-                                                  dblkno, blkcnt, 0, &bp);
+                                                  dblkno, blkcnt, 0, &bp, NULL);
                        if (error)
                                return(error);
 
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
        xfs_dablk_t lblkno;
        int blkcnt, valuelen, nmap, error, tmp, committed;
 
+       trace_xfs_attr_rmtval_set(args);
+
        dp = args->dp;
        mp = dp->i_mount;
        src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
        xfs_dablk_t lblkno;
        int valuelen, blkcnt, nmap, error, done, committed;
 
+       trace_xfs_attr_rmtval_remove(args);
+
        mp = args->dp->i_mount;
 
        /*
index 70eec1829776309b3cba83e9a660aa6de5ad9693..ee24993c7d121acacb06e9258f3f94655d814457 100644 (file)
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
                                struct xfs_buf **bpp);
 STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
                                  xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
+                                 struct xfs_buf *leaf_buffer);
 STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
                                                   xfs_da_state_blk_t *blk1,
                                                   xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
                                         xfs_mount_t *mp);
 STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
 
+static void
+xfs_attr_leaf_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_attr_leaf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_attr_leaf_verify(bp);
+}
+
+static void
+xfs_attr_leaf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_attr_leaf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
+       .verify_read = xfs_attr_leaf_read_verify,
+       .verify_write = xfs_attr_leaf_write_verify,
+};
+
+int
+xfs_attr_leaf_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                               XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+}
+
 /*========================================================================
  * Namespace helper routines
  *========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
        error = xfs_da_grow_inode(args, &blkno);
        if (error)
                goto out;
-       error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
-                                            XFS_ATTR_FORK);
+       error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
        if (error)
                goto out;
-       ASSERT(bp1 != NULL);
+
        bp2 = NULL;
        error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
                                            XFS_ATTR_FORK);
        if (error)
                goto out;
-       ASSERT(bp2 != NULL);
+       bp2->b_ops = bp1->b_ops;
        memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
        bp1 = NULL;
        xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
                                            XFS_ATTR_FORK);
        if (error)
                return(error);
-       ASSERT(bp != NULL);
+       bp->b_ops = &xfs_attr_leaf_buf_ops;
        leaf = bp->b_addr;
        memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
        hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
         * Compact the entries to coalesce free space.
         * This may change the hdr->count via dropping INCOMPLETE entries.
         */
-       xfs_attr_leaf_compact(args->trans, bp);
+       xfs_attr_leaf_compact(args, bp);
 
        /*
         * After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
        xfs_mount_t *mp;
        int tmp, i;
 
+       trace_xfs_attr_leaf_add_work(args);
+
        leaf = bp->b_addr;
        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
  */
 STATIC void
 xfs_attr_leaf_compact(
-       struct xfs_trans *trans,
-       struct xfs_buf  *bp)
+       struct xfs_da_args      *args,
+       struct xfs_buf          *bp)
 {
-       xfs_attr_leafblock_t *leaf_s, *leaf_d;
-       xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
-       xfs_mount_t *mp;
-       char *tmpbuffer;
+       xfs_attr_leafblock_t    *leaf_s, *leaf_d;
+       xfs_attr_leaf_hdr_t     *hdr_s, *hdr_d;
+       struct xfs_trans        *trans = args->trans;
+       struct xfs_mount        *mp = trans->t_mountp;
+       char                    *tmpbuffer;
+
+       trace_xfs_attr_leaf_compact(args);
 
-       mp = trans->t_mountp;
        tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
        ASSERT(tmpbuffer != NULL);
        memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1345,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
                max  = be16_to_cpu(hdr2->firstused)
                                                - sizeof(xfs_attr_leaf_hdr_t);
                max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
-               if (space > max) {
-                       xfs_attr_leaf_compact(args->trans, blk2->bp);
-               }
+               if (space > max)
+                       xfs_attr_leaf_compact(args, blk2->bp);
 
                /*
                 * Move high entries from leaf1 to low end of leaf2.
@@ -1378,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
                max  = be16_to_cpu(hdr1->firstused)
                                                - sizeof(xfs_attr_leaf_hdr_t);
                max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
-               if (space > max) {
-                       xfs_attr_leaf_compact(args->trans, blk1->bp);
-               }
+               if (space > max)
+                       xfs_attr_leaf_compact(args, blk1->bp);
 
                /*
                 * Move low entries from leaf2 to high end of leaf1.
@@ -1577,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
        xfs_dablk_t blkno;
        struct xfs_buf *bp;
 
+       trace_xfs_attr_leaf_toosmall(state->args);
+
        /*
         * Check for the degenerate case of the block being over 50% full.
         * If so, it's not worth even looking to see if we might be able
@@ -1636,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
                        blkno = be32_to_cpu(info->back);
                if (blkno == 0)
                        continue;
-               error = xfs_da_read_buf(state->args->trans, state->args->dp,
-                                       blkno, -1, &bp, XFS_ATTR_FORK);
+               error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+                                       blkno, -1, &bp);
                if (error)
                        return(error);
-               ASSERT(bp != NULL);
 
                leaf = (xfs_attr_leafblock_t *)info;
                count  = be16_to_cpu(leaf->hdr.count);
                bytes  = state->blocksize - (state->blocksize>>2);
                bytes -= be16_to_cpu(leaf->hdr.usedbytes);
                leaf = bp->b_addr;
-               ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
                count += be16_to_cpu(leaf->hdr.count);
                bytes -= be16_to_cpu(leaf->hdr.usedbytes);
                bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1702,6 +1750,8 @@ xfs_attr_leaf_remove(
        int tablesize, tmp, i;
        xfs_mount_t *mp;
 
+       trace_xfs_attr_leaf_remove(args);
+
        leaf = bp->b_addr;
        ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        hdr = &leaf->hdr;
@@ -2511,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
        /*
         * Set up the operation.
         */
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-                                            XFS_ATTR_FORK);
-       if (error) {
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
                return(error);
-       }
-       ASSERT(bp != NULL);
 
        leaf = bp->b_addr;
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
        ASSERT(args->index >= 0);
        entry = &leaf->entries[ args->index ];
@@ -2576,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
        /*
         * Set up the operation.
         */
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
-                                            XFS_ATTR_FORK);
-       if (error) {
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+       if (error)
                return(error);
-       }
-       ASSERT(bp != NULL);
 
        leaf = bp->b_addr;
-       ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
        ASSERT(args->index >= 0);
        entry = &leaf->entries[ args->index ];
@@ -2633,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
        /*
         * Read the block containing the "old" attr
         */
-       error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
-                                            XFS_ATTR_FORK);
-       if (error) {
-               return(error);
-       }
-       ASSERT(bp1 != NULL);
+       error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+       if (error)
+               return error;
 
        /*
         * Read the block containing the "new" attr, if it is different
         */
        if (args->blkno2 != args->blkno) {
-               error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
-                                       -1, &bp2, XFS_ATTR_FORK);
-               if (error) {
-                       return(error);
-               }
-               ASSERT(bp2 != NULL);
+               error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+                                          -1, &bp2);
+               if (error)
+                       return error;
        } else {
                bp2 = bp1;
        }
 
        leaf1 = bp1->b_addr;
-       ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
        ASSERT(args->index >= 0);
        entry1 = &leaf1->entries[ args->index ];
 
        leaf2 = bp2->b_addr;
-       ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
        ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
        ASSERT(args->index2 >= 0);
        entry2 = &leaf2->entries[ args->index2 ];
@@ -2746,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
         * the extents in reverse order the extent containing
         * block 0 must still be there.
         */
-       error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+       error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
        if (error)
                return(error);
        blkno = XFS_BUF_ADDR(bp);
@@ -2831,7 +2866,7 @@ xfs_attr_node_inactive(
                 * traversal of the tree so we may deal with many blocks
                 * before we come back to this one.
                 */
-               error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+               error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
                                                XFS_ATTR_FORK);
                if (error)
                        return(error);
@@ -2872,8 +2907,8 @@ xfs_attr_node_inactive(
                 * child block number.
                 */
                if ((i+1) < count) {
-                       error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
-                               &bp, XFS_ATTR_FORK);
+                       error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+                                                &bp, XFS_ATTR_FORK);
                        if (error)
                                return(error);
                        child_fsb = be32_to_cpu(node->btree[i+1].before);
index dea17722945e1646a54498047c31c6918421f8ae..77de139a58f0277991cd2d1ede0e045def90ec4b 100644 (file)
@@ -261,4 +261,10 @@ int        xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
                                   struct xfs_buf *leaf2_bp);
 int    xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
                                        int *local);
+int    xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                       xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                       struct xfs_buf **bpp);
+
+extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+
 #endif /* __XFS_ATTR_LEAF_H__ */
index 83d0cf3df930794c307c4adcb33dceb8a5415f7e..0e92d12765d2670146b0325cd1215b449fb60dfe 100644 (file)
@@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents(
        if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
                return error;
 #endif
-       if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
-                       XFS_BMAP_BTREE_REF)))
+       error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+                               &xfs_bmbt_buf_ops);
+       if (error)
                return error;
        cblock = XFS_BUF_TO_BLOCK(cbp);
        if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3123,6 +3124,7 @@ xfs_bmap_extents_to_btree(
        /*
         * Fill in the child block.
         */
+       abp->b_ops = &xfs_bmbt_buf_ops;
        ablock = XFS_BUF_TO_BLOCK(abp);
        ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
        ablock->bb_level = 0;
@@ -3269,6 +3271,7 @@ xfs_bmap_local_to_extents(
                ASSERT(args.len == 1);
                *firstblock = args.fsbno;
                bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+               bp->b_ops = &xfs_bmbt_buf_ops;
                memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
                xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
                xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4078,8 +4081,9 @@ xfs_bmap_read_extents(
         * pointer (leftmost) at each level.
         */
        while (level-- > 0) {
-               if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF)))
+               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+               if (error)
                        return error;
                block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
@@ -4124,7 +4128,8 @@ xfs_bmap_read_extents(
                 */
                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                if (nextbno != NULLFSBLOCK)
-                       xfs_btree_reada_bufl(mp, nextbno, 1);
+                       xfs_btree_reada_bufl(mp, nextbno, 1,
+                                            &xfs_bmbt_buf_ops);
                /*
                 * Copy records into the extent records.
                 */
@@ -4156,8 +4161,9 @@ xfs_bmap_read_extents(
                 */
                if (bno == NULLFSBLOCK)
                        break;
-               if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF)))
+               error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                               XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+               if (error)
                        return error;
                block = XFS_BUF_TO_BLOCK(bp);
        }
@@ -5599,7 +5605,7 @@ xfs_getbmap(
        xfs_ilock(ip, XFS_IOLOCK_SHARED);
        if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
                if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
-                       error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+                       error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
                        if (error)
                                goto out_unlock_iolock;
                }
@@ -5868,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
         */
        while (level-- > 0) {
                /* See if buf is in cur first */
+               bp_release = 0;
                bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-               if (bp) {
-                       bp_release = 0;
-               } else {
+               if (!bp) {
                        bp_release = 1;
+                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               goto error_norelse;
                }
-               if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF)))
-                       goto error_norelse;
                block = XFS_BUF_TO_BLOCK(bp);
                XFS_WANT_CORRUPTED_GOTO(
                        xfs_bmap_sanity_check(mp, bp, level),
@@ -5953,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
                if (bno == NULLFSBLOCK)
                        break;
 
+               bp_release = 0;
                bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
-               if (bp) {
-                       bp_release = 0;
-               } else {
+               if (!bp) {
                        bp_release = 1;
+                       error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
+                               goto error_norelse;
                }
-               if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF)))
-                       goto error_norelse;
                block = XFS_BUF_TO_BLOCK(bp);
        }
        if (bp_release) {
@@ -6052,7 +6060,9 @@ xfs_bmap_count_tree(
        struct xfs_btree_block  *block, *nextblock;
        int                     numrecs;
 
-       if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+       if (error)
                return error;
        *count += 1;
        block = XFS_BUF_TO_BLOCK(bp);
@@ -6061,8 +6071,10 @@ xfs_bmap_count_tree(
                /* Not at node above leaves, count this level of nodes */
                nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
                while (nextbno != NULLFSBLOCK) {
-                       if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
-                               0, &nbp, XFS_BMAP_BTREE_REF)))
+                       error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
                                return error;
                        *count += 1;
                        nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6091,8 +6103,10 @@ xfs_bmap_count_tree(
                        if (nextbno == NULLFSBLOCK)
                                break;
                        bno = nextbno;
-                       if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
-                               XFS_BMAP_BTREE_REF)))
+                       error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+                                               XFS_BMAP_BTREE_REF,
+                                               &xfs_bmbt_buf_ops);
+                       if (error)
                                return error;
                        *count += 1;
                        block = XFS_BUF_TO_BLOCK(bp);
index 862084a47a7e269a8ab4357d5bf75906688ee86c..061b45cbe61461a5aa1f1d93f25deb2ec8b23d01 100644 (file)
@@ -36,6 +36,7 @@
 #include "xfs_bmap.h"
 #include "xfs_error.h"
 #include "xfs_quota.h"
+#include "xfs_trace.h"
 
 /*
  * Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
                                      cur->bc_rec.b.br_startoff;
 }
 
+static void
+xfs_bmbt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       unsigned int            level;
+       int                     lblock_ok; /* block passes checks */
+
+       /* magic number and level verification.
+        *
+        * We don't know what fork we belong to, so just verify that the level
+        * is less than the maximum of the two. Later checks will be more
+        * precise.
+        */
+       level = be16_to_cpu(block->bb_level);
+       lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
+                   level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
+
+       /* numrecs verification */
+       lblock_ok = lblock_ok &&
+               be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+
+       /* sibling pointer verification */
+       lblock_ok = lblock_ok &&
+               block->bb_u.l.bb_leftsib &&
+               (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+               block->bb_u.l.bb_rightsib &&
+               (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+                XFS_FSB_SANITY_CHECK(mp,
+                       be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+       if (!lblock_ok) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_bmbt_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_bmbt_verify(bp);
+}
+
+static void
+xfs_bmbt_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_bmbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+       .verify_read = xfs_bmbt_read_verify,
+       .verify_write = xfs_bmbt_write_verify,
+};
+
+
 #ifdef DEBUG
 STATIC int
 xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
        .init_rec_from_cur      = xfs_bmbt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_bmbt_init_ptr_from_cur,
        .key_diff               = xfs_bmbt_key_diff,
+       .buf_ops                = &xfs_bmbt_buf_ops,
 #ifdef DEBUG
        .keys_inorder           = xfs_bmbt_keys_inorder,
        .recs_inorder           = xfs_bmbt_recs_inorder,
index 0e66c4ea0f8581ef06f2248d72953a84b4cbafbf..88469ca086960986d4768f03b43979d517326c75 100644 (file)
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
                struct xfs_trans *, struct xfs_inode *, int);
 
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
 
 #endif /* __XFS_BMAP_BTREE_H__ */
index e53e317b1582fecbc02f06fbc5225cedf1c28a5f..db010408d7011b39e672ada10dd3e5429e98dcce 100644 (file)
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
        for (i = 0; i < new->bc_nlevels; i++) {
                new->bc_ptrs[i] = cur->bc_ptrs[i];
                new->bc_ra[i] = cur->bc_ra[i];
-               if ((bp = cur->bc_bufs[i])) {
-                       if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
-                               XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+               bp = cur->bc_bufs[i];
+               if (bp) {
+                       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+                                                  XFS_BUF_ADDR(bp), mp->m_bsize,
+                                                  0, &bp,
+                                                  cur->bc_ops->buf_ops);
+                       if (error) {
                                xfs_btree_del_cursor(new, error);
                                *ncur = NULL;
                                return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
  * Get a buffer for the block, return it read in.
  * Long-form addressing.
  */
-int                                    /* error */
+int
 xfs_btree_read_bufl(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_trans_t     *tp,            /* transaction pointer */
-       xfs_fsblock_t   fsbno,          /* file system block number */
-       uint            lock,           /* lock flags for read_buf */
-       xfs_buf_t       **bpp,          /* buffer for fsbno */
-       int             refval)         /* ref count value for buffer */
-{
-       xfs_buf_t       *bp;            /* return value */
+       struct xfs_mount        *mp,            /* file system mount point */
+       struct xfs_trans        *tp,            /* transaction pointer */
+       xfs_fsblock_t           fsbno,          /* file system block number */
+       uint                    lock,           /* lock flags for read_buf */
+       struct xfs_buf          **bpp,          /* buffer for fsbno */
+       int                     refval,         /* ref count value for buffer */
+       const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf          *bp;            /* return value */
        xfs_daddr_t             d;              /* real disk block address */
-       int             error;
+       int                     error;
 
        ASSERT(fsbno != NULLFSBLOCK);
        d = XFS_FSB_TO_DADDR(mp, fsbno);
-       if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
-                       mp->m_bsize, lock, &bp))) {
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+                                  mp->m_bsize, lock, &bp, ops);
+       if (error)
                return error;
-       }
        ASSERT(!xfs_buf_geterror(bp));
        if (bp)
                xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
 /* ARGSUSED */
 void
 xfs_btree_reada_bufl(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_fsblock_t   fsbno,          /* file system block number */
-       xfs_extlen_t    count)          /* count of filesystem blocks */
+       struct xfs_mount        *mp,            /* file system mount point */
+       xfs_fsblock_t           fsbno,          /* file system block number */
+       xfs_extlen_t            count,          /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops)
 {
        xfs_daddr_t             d;
 
        ASSERT(fsbno != NULLFSBLOCK);
        d = XFS_FSB_TO_DADDR(mp, fsbno);
-       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 }
 
 /*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
 /* ARGSUSED */
 void
 xfs_btree_reada_bufs(
-       xfs_mount_t     *mp,            /* file system mount point */
-       xfs_agnumber_t  agno,           /* allocation group number */
-       xfs_agblock_t   agbno,          /* allocation group block number */
-       xfs_extlen_t    count)          /* count of filesystem blocks */
+       struct xfs_mount        *mp,            /* file system mount point */
+       xfs_agnumber_t          agno,           /* allocation group number */
+       xfs_agblock_t           agbno,          /* allocation group block number */
+       xfs_extlen_t            count,          /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops)
 {
        xfs_daddr_t             d;
 
        ASSERT(agno != NULLAGNUMBER);
        ASSERT(agbno != NULLAGBLOCK);
        d = XFS_AGB_TO_DADDR(mp, agno, agbno);
-       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+       xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
 }
 
 STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
        xfs_dfsbno_t            right = be64_to_cpu(block->bb_u.l.bb_rightsib);
 
        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
-               xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+               xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+                                    cur->bc_ops->buf_ops);
                rval++;
        }
 
        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
-               xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+               xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+                                    cur->bc_ops->buf_ops);
                rval++;
        }
 
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
 
        if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                                    left, 1);
+                                    left, 1, cur->bc_ops->buf_ops);
                rval++;
        }
 
        if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
                xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
-                                    right, 1);
+                                    right, 1, cur->bc_ops->buf_ops);
                rval++;
        }
 
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
        }
 }
 
-STATIC void
+void
 xfs_btree_init_block(
-       struct xfs_btree_cur    *cur,
-       int                     level,
-       int                     numrecs,
-       struct xfs_btree_block  *new)   /* new block */
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       __u32           magic,
+       __u16           level,
+       __u16           numrecs,
+       unsigned int    flags)
 {
-       new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+       struct xfs_btree_block  *new = XFS_BUF_TO_BLOCK(bp);
+
+       new->bb_magic = cpu_to_be32(magic);
        new->bb_level = cpu_to_be16(level);
        new->bb_numrecs = cpu_to_be16(numrecs);
 
-       if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+       if (flags & XFS_BTREE_LONG_PTRS) {
                new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
                new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
        } else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
        }
 }
 
+STATIC void
+xfs_btree_init_block_cur(
+       struct xfs_btree_cur    *cur,
+       int                     level,
+       int                     numrecs,
+       struct xfs_buf          *bp)
+{
+       xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
+                              level, numrecs, cur->bc_flags);
+}
+
 /*
  * Return true if ptr is the last record in the btree and
 * we need to track updates to this record.  The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
        if (!*bpp)
                return ENOMEM;
 
+       (*bpp)->b_ops = cur->bc_ops->buf_ops;
        *block = XFS_BUF_TO_BLOCK(*bpp);
        return 0;
 }
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
 
        d = xfs_btree_ptr_to_daddr(cur, ptr);
        error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
-                                  mp->m_bsize, flags, bpp);
+                                  mp->m_bsize, flags, bpp,
+                                  cur->bc_ops->buf_ops);
        if (error)
                return error;
 
        ASSERT(!xfs_buf_geterror(*bpp));
-
        xfs_btree_set_refs(cur, *bpp);
        *block = XFS_BUF_TO_BLOCK(*bpp);
-
-       error = xfs_btree_check_block(cur, *block, level, *bpp);
-       if (error)
-               xfs_trans_brelse(cur->bc_tp, *bpp);
-       return error;
+       return 0;
 }
 
 /*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
                goto error0;
 
        /* Fill in the btree header for the new right block. */
-       xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+       xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
 
        /*
         * Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
                nptr = 2;
        }
        /* Fill in the new block's btree header and log it. */
-       xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+       xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
        xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
        ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
                        !xfs_btree_ptr_is_null(cur, &rptr));
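
With xfs_btree_init_block() made public and decoupled from the btree cursor, callers that build btree blocks without a cursor in hand (the growfs infrastructure changes in this pull are one such user) can stamp a block header directly. A minimal sketch, assuming a freshly allocated *bp; the exact values are illustrative rather than taken from this diff:

	/*
	 * Initialise an empty by-block allocation btree root: XFS_ABTB_MAGIC
	 * is the bnobt magic, level 0 marks a leaf, it carries one record,
	 * and flags 0 selects short (AG-relative) sibling pointers.
	 */
	xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
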
index 5b240de104c0a39955bac019f384990a1e6fb457..f932897194eb33b5be8d8131a98c1941fbea26e7 100644 (file)
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
        __int64_t (*key_diff)(struct xfs_btree_cur *cur,
                              union xfs_btree_key *key);
 
+       const struct xfs_buf_ops        *buf_ops;
+
 #ifdef DEBUG
        /* check that k1 is lower than k2 */
        int     (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
        xfs_fsblock_t           fsbno,  /* file system block number */
        uint                    lock,   /* lock flags for read_buf */
        struct xfs_buf          **bpp,  /* buffer for fsbno */
-       int                     refval);/* ref count value for buffer */
+       int                     refval, /* ref count value for buffer */
+       const struct xfs_buf_ops *ops);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void                                        /* error */
 xfs_btree_reada_bufl(
        struct xfs_mount        *mp,    /* file system mount point */
        xfs_fsblock_t           fsbno,  /* file system block number */
-       xfs_extlen_t            count); /* count of filesystem blocks */
+       xfs_extlen_t            count,  /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops);
 
 /*
  * Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
        struct xfs_mount        *mp,    /* file system mount point */
        xfs_agnumber_t          agno,   /* allocation group number */
        xfs_agblock_t           agbno,  /* allocation group block number */
-       xfs_extlen_t            count); /* count of filesystem blocks */
+       xfs_extlen_t            count,  /* count of filesystem blocks */
+       const struct xfs_buf_ops *ops);
 
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+       struct xfs_mount *mp,
+       struct xfs_buf  *bp,
+       __u32           magic,
+       __u16           level,
+       __u16           numrecs,
+       unsigned int    flags);
 
 /*
  * Common btree core entry points.
index 4b0b8dd1b7b0ea2bbc431ef523ec5412eb4fd584..26673a0b20e7249a149b22357c0961f86a6ac903 100644 (file)
@@ -569,7 +569,9 @@ found:
         */
        if (bp->b_flags & XBF_STALE) {
                ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+               ASSERT(bp->b_iodone == NULL);
                bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+               bp->b_ops = NULL;
        }
 
        trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
        struct xfs_buftarg      *target,
        struct xfs_buf_map      *map,
        int                     nmaps,
-       xfs_buf_flags_t         flags)
+       xfs_buf_flags_t         flags,
+       const struct xfs_buf_ops *ops)
 {
        struct xfs_buf          *bp;
 
@@ -666,6 +669,7 @@ xfs_buf_read_map(
 
                if (!XFS_BUF_ISDONE(bp)) {
                        XFS_STATS_INC(xb_get_read);
+                       bp->b_ops = ops;
                        _xfs_buf_read(bp, flags);
                } else if (flags & XBF_ASYNC) {
                        /*
@@ -691,13 +695,14 @@ void
 xfs_buf_readahead_map(
        struct xfs_buftarg      *target,
        struct xfs_buf_map      *map,
-       int                     nmaps)
+       int                     nmaps,
+       const struct xfs_buf_ops *ops)
 {
        if (bdi_read_congested(target->bt_bdi))
                return;
 
        xfs_buf_read_map(target, map, nmaps,
-                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
+                    XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
 }
 
 /*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
        struct xfs_buftarg      *target,
        xfs_daddr_t             daddr,
        size_t                  numblks,
-       int                     flags)
+       int                     flags,
+       const struct xfs_buf_ops *ops)
 {
-       xfs_buf_t               *bp;
-       int                     error;
+       struct xfs_buf          *bp;
 
        bp = xfs_buf_get_uncached(target, numblks, flags);
        if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
        bp->b_bn = daddr;
        bp->b_maps[0].bm_bn = daddr;
        bp->b_flags |= XBF_READ;
+       bp->b_ops = ops;
 
        xfsbdstrat(target->bt_mount, bp);
-       error = xfs_buf_iowait(bp);
-       if (error) {
-               xfs_buf_relse(bp);
-               return NULL;
-       }
+       xfs_buf_iowait(bp);
        return bp;
 }
 
@@ -999,27 +1001,37 @@ STATIC void
 xfs_buf_iodone_work(
        struct work_struct      *work)
 {
-       xfs_buf_t               *bp =
+       struct xfs_buf          *bp =
                container_of(work, xfs_buf_t, b_iodone_work);
+       bool                    read = !!(bp->b_flags & XBF_READ);
+
+       bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+       if (read && bp->b_ops)
+               bp->b_ops->verify_read(bp);
 
        if (bp->b_iodone)
                (*(bp->b_iodone))(bp);
        else if (bp->b_flags & XBF_ASYNC)
                xfs_buf_relse(bp);
+       else {
+               ASSERT(read && bp->b_ops);
+               complete(&bp->b_iowait);
+       }
 }
 
 void
 xfs_buf_ioend(
-       xfs_buf_t               *bp,
-       int                     schedule)
+       struct xfs_buf  *bp,
+       int             schedule)
 {
+       bool            read = !!(bp->b_flags & XBF_READ);
+
        trace_xfs_buf_iodone(bp, _RET_IP_);
 
-       bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
        if (bp->b_error == 0)
                bp->b_flags |= XBF_DONE;
 
-       if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+       if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
                if (schedule) {
                        INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
                        queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
                        xfs_buf_iodone_work(&bp->b_iodone_work);
                }
        } else {
+               bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
                complete(&bp->b_iowait);
        }
 }
@@ -1314,6 +1327,20 @@ _xfs_buf_ioapply(
                        rw |= REQ_FUA;
                if (bp->b_flags & XBF_FLUSH)
                        rw |= REQ_FLUSH;
+
+               /*
+                * Run the write verifier callback function if it exists. If
+                * this function fails, it will mark the buffer with an error and
+                * the IO should not be dispatched.
+                */
+               if (bp->b_ops) {
+                       bp->b_ops->verify_write(bp);
+                       if (bp->b_error) {
+                               xfs_force_shutdown(bp->b_target->bt_mount,
+                                                  SHUTDOWN_CORRUPT_INCORE);
+                               return;
+                       }
+               }
        } else if (bp->b_flags & XBF_READ_AHEAD) {
                rw = READA;
        } else {
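
The write verifier contract above is the one every per-type verifier in this series follows: check the on-disk structure, and on failure mark the buffer so _xfs_buf_ioapply() refuses to dispatch the I/O. A minimal sketch of a conforming callback; the xfs_foo names, header type, and magic value are invented for illustration:

	static void
	xfs_foo_write_verify(
		struct xfs_buf		*bp)
	{
		struct xfs_mount	*mp = bp->b_target->bt_mount;
		struct xfs_foo_hdr	*hdr = bp->b_addr;	/* hypothetical on-disk type */

		/* structural checks go here; on failure mark the buffer bad */
		if (hdr->magic != cpu_to_be32(XFS_FOO_MAGIC)) {	/* invented magic */
			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
			xfs_buf_ioerror(bp, EFSCORRUPTED);	/* blocks the dispatch */
		}
	}
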
index 7c0b6a0a1557c0d30db4b94e4bc27879d2614537..23f5642480bb1ea5f0f8a42423c607e7d61e9b83 100644 (file)
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
 struct xfs_buf;
 typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
 
+
 #define XB_PAGES       2
 
 struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
 #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
        struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
 
+struct xfs_buf_ops {
+       void (*verify_read)(struct xfs_buf *);
+       void (*verify_write)(struct xfs_buf *);
+};
+
 typedef struct xfs_buf {
        /*
         * first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
        unsigned int            b_page_count;   /* size of page array */
        unsigned int            b_offset;       /* page offset in first page */
        unsigned short          b_error;        /* error code on I/O */
+       const struct xfs_buf_ops        *b_ops;
 
 #ifdef XFS_BUF_LOCK_TRACKING
        int                     b_last_holder;
 #endif
 } xfs_buf_t;
 
-
 /* Finding and Reading Buffers */
 struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
                              struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
                               xfs_buf_flags_t flags);
 struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
                               struct xfs_buf_map *map, int nmaps,
-                              xfs_buf_flags_t flags);
+                              xfs_buf_flags_t flags,
+                              const struct xfs_buf_ops *ops);
 void xfs_buf_readahead_map(struct xfs_buftarg *target,
-                              struct xfs_buf_map *map, int nmaps);
+                              struct xfs_buf_map *map, int nmaps,
+                              const struct xfs_buf_ops *ops);
 
 static inline struct xfs_buf *
 xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
        struct xfs_buftarg      *target,
        xfs_daddr_t             blkno,
        size_t                  numblks,
-       xfs_buf_flags_t         flags)
+       xfs_buf_flags_t         flags,
+       const struct xfs_buf_ops *ops)
 {
        DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-       return xfs_buf_read_map(target, &map, 1, flags);
+       return xfs_buf_read_map(target, &map, 1, flags, ops);
 }
 
 static inline void
 xfs_buf_readahead(
        struct xfs_buftarg      *target,
        xfs_daddr_t             blkno,
-       size_t                  numblks)
+       size_t                  numblks,
+       const struct xfs_buf_ops *ops)
 {
        DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
-       return xfs_buf_readahead_map(target, &map, 1);
+       return xfs_buf_readahead_map(target, &map, 1, ops);
 }
 
 struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
 struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
                                int flags);
 struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
-                               xfs_daddr_t daddr, size_t numblks, int flags);
+                               xfs_daddr_t daddr, size_t numblks, int flags,
+                               const struct xfs_buf_ops *ops);
 void xfs_buf_hold(struct xfs_buf *bp);
 
 /* Releasing Buffers */
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644 (file)
index 0000000..fad1676
--- /dev/null
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED   (~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it.  The offset of the 32bit crc field is passed as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t zero = 0;
+       __uint32_t crc;
+
+       /* Calculate CRC up to the checksum. */
+       crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+       /* Skip checksum field */
+       crc = crc32c(crc, &zero, sizeof(__u32));
+
+       /* Calculate the rest of the CRC. */
+       return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+                     length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+       return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+       *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+       __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+       return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
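
Together these helpers define the on-disk CRC discipline that the log checksumming in this series builds on: stamp the checksum before a write, verify it after a read, always computing over the whole buffer with the checksum field itself treated as zero. A fragmentary sketch of a hypothetical caller; struct xfs_foo_hdr and its fields are invented for illustration:

	struct xfs_foo_hdr {
		__be32	foo_magic;
		__le32	foo_crc;	/* CRC over the whole block, LE on disk */
		/* ... remainder of the on-disk block ... */
	};

	/* before write: compute the CRC over the block, skipping foo_crc */
	xfs_update_cksum(block, blocksize, offsetof(struct xfs_foo_hdr, foo_crc));

	/* after read: a zero return means the checksum does not match */
	if (!xfs_verify_cksum(block, blocksize,
			      offsetof(struct xfs_foo_hdr, foo_crc)))
		return EFSCORRUPTED;
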
index 7bfb7dd334fc0442204ce6e483f1fc5ba4dee65f..4d7696a024184459209d2386f15fa92a42540b98 100644 (file)
@@ -91,6 +91,84 @@ STATIC int   xfs_da_blk_unlink(xfs_da_state_t *state,
                                  xfs_da_state_blk_t *save_blk);
 STATIC void    xfs_da_state_kill_altpath(xfs_da_state_t *state);
 
+static void
+xfs_da_node_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_da_node_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
+       block_ok = block_ok &&
+                       be16_to_cpu(hdr->level) > 0 &&
+                       be16_to_cpu(hdr->count) > 0;
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+
+}
+
+static void
+xfs_da_node_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_da_node_verify(bp);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da_node_read_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_da_blkinfo   *info = bp->b_addr;
+
+       switch (be16_to_cpu(info->magic)) {
+               case XFS_DA_NODE_MAGIC:
+                       xfs_da_node_verify(bp);
+                       break;
+               case XFS_ATTR_LEAF_MAGIC:
+                       bp->b_ops = &xfs_attr_leaf_buf_ops;
+                       bp->b_ops->verify_read(bp);
+                       return;
+               case XFS_DIR2_LEAFN_MAGIC:
+                       bp->b_ops = &xfs_dir2_leafn_buf_ops;
+                       bp->b_ops->verify_read(bp);
+                       return;
+               default:
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+                                            mp, info);
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);
+                       break;
+       }
+}
+
+const struct xfs_buf_ops xfs_da_node_buf_ops = {
+       .verify_read = xfs_da_node_read_verify,
+       .verify_write = xfs_da_node_write_verify,
+};
+
+
+int
+xfs_da_node_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp,
+       int                     which_fork)
+{
+       return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+                                       which_fork, &xfs_da_node_buf_ops);
+}
+
 /*========================================================================
  * Routines used for growing the Btree.
  *========================================================================*/
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
        xfs_trans_log_buf(tp, bp,
                XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
 
+       bp->b_ops = &xfs_da_node_buf_ops;
        *bpp = bp;
        return(0);
 }
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
        }
        memcpy(node, oldroot, size);
        xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+       bp->b_ops = blk1->bp->b_ops;
        blk1->bp = bp;
        blk1->blkno = blkno;
 
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
         */
        child = be32_to_cpu(oldroot->btree[0].before);
        ASSERT(child != 0);
-       error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+       error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
                                             args->whichfork);
        if (error)
                return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
        xfs_da_blkinfo_onlychild_validate(bp->b_addr,
                                        be16_to_cpu(oldroot->hdr.level));
 
+       /*
+        * This could be copying a leaf back into the root block in the case of
+        * there only being a single leaf block left in the tree. Hence we have
+        * to update the b_ops pointer as well to match the buffer type change
+        * that could occur.
+        */
        memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+       root_blk->bp->b_ops = bp->b_ops;
        xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
        error = xfs_da_shrink_inode(args, child, bp);
        return(error);
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
        xfs_dablk_t blkno;
        struct xfs_buf *bp;
 
+       trace_xfs_da_node_toosmall(state->args);
+
        /*
         * Check for the degenerate case of the block being over 50% full.
         * If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
                        blkno = be32_to_cpu(info->back);
                if (blkno == 0)
                        continue;
-               error = xfs_da_read_buf(state->args->trans, state->args->dp,
+               error = xfs_da_node_read(state->args->trans, state->args->dp,
                                        blkno, -1, &bp, state->args->whichfork);
                if (error)
                        return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
        xfs_dahash_t lasthash=0;
        int level, count;
 
+       trace_xfs_da_fixhashpath(state->args);
+
        level = path->active-1;
        blk = &path->blk[ level ];
        switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
                 * Read the next node down in the tree.
                 */
                blk->blkno = blkno;
-               error = xfs_da_read_buf(args->trans, args->dp, blkno,
+               error = xfs_da_node_read(args->trans, args->dp, blkno,
                                        -1, &blk->bp, args->whichfork);
                if (error) {
                        blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
                new_info->forw = cpu_to_be32(old_blk->blkno);
                new_info->back = old_info->back;
                if (old_info->back) {
-                       error = xfs_da_read_buf(args->trans, args->dp,
+                       error = xfs_da_node_read(args->trans, args->dp,
                                                be32_to_cpu(old_info->back),
                                                -1, &bp, args->whichfork);
                        if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
                new_info->forw = old_info->forw;
                new_info->back = cpu_to_be32(old_blk->blkno);
                if (old_info->forw) {
-                       error = xfs_da_read_buf(args->trans, args->dp,
+                       error = xfs_da_node_read(args->trans, args->dp,
                                                be32_to_cpu(old_info->forw),
                                                -1, &bp, args->whichfork);
                        if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
                trace_xfs_da_unlink_back(args);
                save_info->back = drop_info->back;
                if (drop_info->back) {
-                       error = xfs_da_read_buf(args->trans, args->dp,
+                       error = xfs_da_node_read(args->trans, args->dp,
                                                be32_to_cpu(drop_info->back),
                                                -1, &bp, args->whichfork);
                        if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
                trace_xfs_da_unlink_forward(args);
                save_info->forw = drop_info->forw;
                if (drop_info->forw) {
-                       error = xfs_da_read_buf(args->trans, args->dp,
+                       error = xfs_da_node_read(args->trans, args->dp,
                                                be32_to_cpu(drop_info->forw),
                                                -1, &bp, args->whichfork);
                        if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
        xfs_dablk_t blkno=0;
        int level, error;
 
+       trace_xfs_da_path_shift(state->args);
+
        /*
         * Roll up the Btree looking for the first block where our
         * current index is not at the edge of the block.  Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
                 * Read the next child block.
                 */
                blk->blkno = blkno;
-               error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
-                                                    &blk->bp, args->whichfork);
+               error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+                                       &blk->bp, args->whichfork);
                if (error)
                        return(error);
                ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
         * Read the last block in the btree space.
         */
        last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
-       if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+       error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+       if (error)
                return error;
        /*
         * Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
         * If the moved block has a left sibling, fix up the pointers.
         */
        if ((sib_blkno = be32_to_cpu(dead_info->back))) {
-               if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+               error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+               if (error)
                        goto done;
                sib_info = sib_buf->b_addr;
                if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
         * If the moved block has a right sibling, fix up the pointers.
         */
        if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
-               if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+               error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+               if (error)
                        goto done;
                sib_info = sib_buf->b_addr;
                if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
         * Walk down the tree looking for the parent of the moved block.
         */
        for (;;) {
-               if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+               error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+               if (error)
                        goto done;
                par_node = par_buf->b_addr;
                if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
                        error = XFS_ERROR(EFSCORRUPTED);
                        goto done;
                }
-               if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+               error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+               if (error)
                        goto done;
                par_node = par_buf->b_addr;
                if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
        xfs_dablk_t             bno,
        xfs_daddr_t             mappedbno,
        struct xfs_buf          **bpp,
-       int                     whichfork)
+       int                     whichfork,
+       const struct xfs_buf_ops *ops)
 {
        struct xfs_buf          *bp;
        struct xfs_buf_map      map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
 
        error = xfs_trans_read_buf_map(dp->i_mount, trans,
                                        dp->i_mount->m_ddev_targp,
-                                       mapp, nmap, 0, &bp);
+                                       mapp, nmap, 0, &bp, ops);
        if (error)
                goto out_free;
 
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
        struct xfs_trans        *trans,
        struct xfs_inode        *dp,
        xfs_dablk_t             bno,
-       int                     whichfork)
+       xfs_daddr_t             mappedbno,
+       int                     whichfork,
+       const struct xfs_buf_ops *ops)
 {
-       xfs_daddr_t             mappedbno = -1;
        struct xfs_buf_map      map;
        struct xfs_buf_map      *mapp;
        int                     nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
 
        mapp = &map;
        nmap = 1;
-       error = xfs_dabuf_map(trans, dp, bno, -1, whichfork,
+       error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
                                &mapp, &nmap);
        if (error) {
                /* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
        }
 
        mappedbno = mapp[0].bm_bn;
-       xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap);
+       xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
 
 out_free:
        if (mapp != &map)
index 132adafb041ecf2c6483047f63940fbccd118d15..ee5170c46ae1abb5d348b493f24843389b4ecbe1 100644 (file)
@@ -18,7 +18,6 @@
 #ifndef __XFS_DA_BTREE_H__
 #define        __XFS_DA_BTREE_H__
 
-struct xfs_buf;
 struct xfs_bmap_free;
 struct xfs_inode;
 struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
  */
 int    xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
                                       xfs_da_state_blk_t *new_blk);
+int    xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+                        xfs_dablk_t bno, xfs_daddr_t mappedbno,
+                        struct xfs_buf **bpp, int which_fork);
 
 /*
  * Utility routines.
@@ -226,9 +228,11 @@ int        xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
                              struct xfs_buf **bp, int whichfork);
 int    xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
                               xfs_dablk_t bno, xfs_daddr_t mappedbno,
-                              struct xfs_buf **bpp, int whichfork);
+                              struct xfs_buf **bpp, int whichfork,
+                              const struct xfs_buf_ops *ops);
 xfs_daddr_t    xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
-                       xfs_dablk_t bno, int whichfork);
+                               xfs_dablk_t bno, xfs_daddr_t mapped_bno,
+                               int whichfork, const struct xfs_buf_ops *ops);
 int    xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
                                          struct xfs_buf *dead_buf);
 
index b9b8646e62db7357ae882bae00f3a74036c3da81..d0e9c74d3d96a75c18d40f175efd51eec85aab18 100644 (file)
@@ -246,12 +246,10 @@ xfs_swap_extents(
                goto out_unlock;
        }
 
-       if (VN_CACHED(VFS_I(tip)) != 0) {
-               error = xfs_flushinval_pages(tip, 0, -1,
-                               FI_REMAPF_LOCKED);
-               if (error)
-                       goto out_unlock;
-       }
+       error = -filemap_write_and_wait(VFS_I(tip)->i_mapping);
+       if (error)
+               goto out_unlock;
+       truncate_pagecache_range(VFS_I(tip), 0, -1);
 
        /* Verify O_DIRECT for ftmp */
        if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
         * are safe.  We don't really care if non-io related
         * fields change.
         */
-
-       xfs_tosspages(ip, 0, -1, FI_REMAPF);
+       truncate_pagecache_range(VFS_I(ip), 0, -1);
 
        tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
        if ((error = xfs_trans_reserve(tp, 0,
index e93ca8f054f404245b2927b93cdf486387b432c1..7536faaa61e7852175f7b7c20f1e4b0c566c24db 100644 (file)
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
        xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
 }
 
+static void
+xfs_dir2_block_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+       block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_dir2_block_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_block_verify(bp);
+}
+
+static void
+xfs_dir2_block_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_block_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
+       .verify_read = xfs_dir2_block_read_verify,
+       .verify_write = xfs_dir2_block_write_verify,
+};
+
+static int
+xfs_dir2_block_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       struct xfs_buf          **bpp)
+{
+       struct xfs_mount        *mp = dp->i_mount;
+
+       return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+                               XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+}
+
+static void
+xfs_dir2_block_need_space(
+       struct xfs_dir2_data_hdr        *hdr,
+       struct xfs_dir2_block_tail      *btp,
+       struct xfs_dir2_leaf_entry      *blp,
+       __be16                          **tagpp,
+       struct xfs_dir2_data_unused     **dupp,
+       struct xfs_dir2_data_unused     **enddupp,
+       int                             *compact,
+       int                             len)
+{
+       struct xfs_dir2_data_free       *bf;
+       __be16                          *tagp = NULL;
+       struct xfs_dir2_data_unused     *dup = NULL;
+       struct xfs_dir2_data_unused     *enddup = NULL;
+
+       *compact = 0;
+       bf = hdr->bestfree;
+
+       /*
+        * If there are stale entries we'll use one for the leaf.
+        */
+       if (btp->stale) {
+               if (be16_to_cpu(bf[0].length) >= len) {
+                       /*
+                        * The biggest entry is big enough to avoid compaction.
+                        */
+                       dup = (xfs_dir2_data_unused_t *)
+                             ((char *)hdr + be16_to_cpu(bf[0].offset));
+                       goto out;
+               }
+
+               /*
+                * Will need to compact to make this work.
+                * Tag just before the first leaf entry.
+                */
+               *compact = 1;
+               tagp = (__be16 *)blp - 1;
+
+               /* Data object just before the first leaf entry.  */
+               dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+               /*
+                * If it's not free then the data will go where the
+                * leaf data starts now, if it works at all.
+                */
+               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+                       if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+                           (uint)sizeof(*blp) < len)
+                               dup = NULL;
+               } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+                       dup = NULL;
+               else
+                       dup = (xfs_dir2_data_unused_t *)blp;
+               goto out;
+       }
+
+       /*
+        * no stale entries, so just use free space.
+        * Tag just before the first leaf entry.
+        */
+       tagp = (__be16 *)blp - 1;
+
+       /* Data object just before the first leaf entry.  */
+       enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+       /*
+        * If it's not free then we can't do this add without cleaning up:
+        * the space before the first leaf entry needs to be free so it
+        * can be expanded to hold the pointer to the new entry.
+        */
+       if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+               /*
+                * Check out the biggest freespace and see if it's the same one.
+                */
+               dup = (xfs_dir2_data_unused_t *)
+                     ((char *)hdr + be16_to_cpu(bf[0].offset));
+               if (dup != enddup) {
+                       /*
+                        * Not the same free entry, just check its length.
+                        */
+                       if (be16_to_cpu(dup->length) < len)
+                               dup = NULL;
+                       goto out;
+               }
+
+               /*
+                * It is the biggest freespace; can it hold the leaf too?
+                */
+               if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+                       /*
+                        * Yes, use the second-largest entry instead if it works.
+                        */
+                       if (be16_to_cpu(bf[1].length) >= len)
+                               dup = (xfs_dir2_data_unused_t *)
+                                     ((char *)hdr + be16_to_cpu(bf[1].offset));
+                       else
+                               dup = NULL;
+               }
+       }
+out:
+       *tagpp = tagp;
+       *dupp = dup;
+       *enddupp = enddup;
+}
+
+/*
+ * Compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+       struct xfs_trans                *tp,
+       struct xfs_buf                  *bp,
+       struct xfs_dir2_data_hdr        *hdr,
+       struct xfs_dir2_block_tail      *btp,
+       struct xfs_dir2_leaf_entry      *blp,
+       int                             *needlog,
+       int                             *lfloghigh,
+       int                             *lfloglow)
+{
+       int                     fromidx;        /* source leaf index */
+       int                     toidx;          /* target leaf index */
+       int                     needscan = 0;
+       int                     highstale;      /* high stale index */
+
+       fromidx = toidx = be32_to_cpu(btp->count) - 1;
+       highstale = *lfloghigh = -1;
+       for (; fromidx >= 0; fromidx--) {
+               if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+                       if (highstale == -1)
+                               highstale = toidx;
+                       else {
+                               if (*lfloghigh == -1)
+                                       *lfloghigh = toidx;
+                               continue;
+                       }
+               }
+               if (fromidx < toidx)
+                       blp[toidx] = blp[fromidx];
+               toidx--;
+       }
+       *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+       *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+       be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+       xfs_dir2_data_make_free(tp, bp,
+               (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+               (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+               needlog, &needscan);
+       blp += be32_to_cpu(btp->stale) - 1;
+       btp->stale = cpu_to_be32(1);
+       /*
+        * If we now need to rebuild the bestfree map, do so.
+        * This needs to happen before the next call to use_free.
+        */
+       if (needscan)
+               xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+}
+
 /*
  * Add an entry to a block directory.
  */
@@ -63,7 +271,6 @@ int                                          /* error */
 xfs_dir2_block_addname(
        xfs_da_args_t           *args)          /* directory op arguments */
 {
-       xfs_dir2_data_free_t    *bf;            /* bestfree table in block */
        xfs_dir2_data_hdr_t     *hdr;           /* block header */
        xfs_dir2_leaf_entry_t   *blp;           /* block leaf entries */
        struct xfs_buf          *bp;            /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
-       /*
-        * Read the (one and only) directory block into dabuf bp.
-        */
-       if ((error =
-           xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+       /* Read the (one and only) directory block into bp. */
+       error = xfs_dir2_block_read(tp, dp, &bp);
+       if (error)
                return error;
-       }
-       ASSERT(bp != NULL);
-       hdr = bp->b_addr;
-       /*
-        * Check the magic number, corrupted if wrong.
-        */
-       if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
-               XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
-                                    XFS_ERRLEVEL_LOW, mp, hdr);
-               xfs_trans_brelse(tp, bp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
+
        len = xfs_dir2_data_entsize(args->namelen);
+
        /*
         * Set up pointers to parts of the block.
         */
-       bf = hdr->bestfree;
+       hdr = bp->b_addr;
        btp = xfs_dir2_block_tail_p(mp, hdr);
        blp = xfs_dir2_block_leaf_p(btp);
+
        /*
-        * No stale entries?  Need space for entry and new leaf.
-        */
-       if (!btp->stale) {
-               /*
-                * Tag just before the first leaf entry.
-                */
-               tagp = (__be16 *)blp - 1;
-               /*
-                * Data object just before the first leaf entry.
-                */
-               enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-               /*
-                * If it's not free then can't do this add without cleaning up:
-                * the space before the first leaf entry needs to be free so it
-                * can be expanded to hold the pointer to the new entry.
-                */
-               if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
-                       dup = enddup = NULL;
-               /*
-                * Check out the biggest freespace and see if it's the same one.
-                */
-               else {
-                       dup = (xfs_dir2_data_unused_t *)
-                             ((char *)hdr + be16_to_cpu(bf[0].offset));
-                       if (dup == enddup) {
-                               /*
-                                * It is the biggest freespace, is it too small
-                                * to hold the new leaf too?
-                                */
-                               if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
-                                       /*
-                                        * Yes, we use the second-largest
-                                        * entry instead if it works.
-                                        */
-                                       if (be16_to_cpu(bf[1].length) >= len)
-                                               dup = (xfs_dir2_data_unused_t *)
-                                                     ((char *)hdr +
-                                                      be16_to_cpu(bf[1].offset));
-                                       else
-                                               dup = NULL;
-                               }
-                       } else {
-                               /*
-                                * Not the same free entry,
-                                * just check its length.
-                                */
-                               if (be16_to_cpu(dup->length) < len) {
-                                       dup = NULL;
-                               }
-                       }
-               }
-               compact = 0;
-       }
-       /*
-        * If there are stale entries we'll use one for the leaf.
-        * Is the biggest entry enough to avoid compaction?
-        */
-       else if (be16_to_cpu(bf[0].length) >= len) {
-               dup = (xfs_dir2_data_unused_t *)
-                     ((char *)hdr + be16_to_cpu(bf[0].offset));
-               compact = 0;
-       }
-       /*
-        * Will need to compact to make this work.
+        * Find out if we can reuse stale entries or whether we need extra
+        * space for the entry and a new leaf.
         */
-       else {
-               /*
-                * Tag just before the first leaf entry.
-                */
-               tagp = (__be16 *)blp - 1;
-               /*
-                * Data object just before the first leaf entry.
-                */
-               dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
-               /*
-                * If it's not free then the data will go where the
-                * leaf data starts now, if it works at all.
-                */
-               if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
-                           (uint)sizeof(*blp) < len)
-                               dup = NULL;
-               } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
-                       dup = NULL;
-               else
-                       dup = (xfs_dir2_data_unused_t *)blp;
-               compact = 1;
-       }
+       xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+                                 &enddup, &compact, len);
+
        /*
-        * If this isn't a real add, we're done with the buffer.
+        * Done with everything we need for the space check now.
         */
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+       if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
                xfs_trans_brelse(tp, bp);
+               if (!dup)
+                       return XFS_ERROR(ENOSPC);
+               return 0;
+       }
+
        /*
         * If we don't have space for the new entry & leaf ...
         */
        if (!dup) {
-               /*
-                * Not trying to actually do anything, or don't have
-                * a space reservation: return no-space.
-                */
-               if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+               /* Don't have a space reservation: return no-space.  */
+               if (args->total == 0)
                        return XFS_ERROR(ENOSPC);
                /*
                 * Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
                        return error;
                return xfs_dir2_leaf_addname(args);
        }
-       /*
-        * Just checking, and it would work, so say so.
-        */
-       if (args->op_flags & XFS_DA_OP_JUSTCHECK)
-               return 0;
+
        needlog = needscan = 0;
+
        /*
         * If need to compact the leaf entries, do it now.
-        * Leave the highest-numbered stale entry stale.
-        * XXX should be the one closest to mid but mid is not yet computed.
-        */
-       if (compact) {
-               int     fromidx;                /* source leaf index */
-               int     toidx;                  /* target leaf index */
-
-               for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
-                       highstale = lfloghigh = -1;
-                    fromidx >= 0;
-                    fromidx--) {
-                       if (blp[fromidx].address ==
-                           cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
-                               if (highstale == -1)
-                                       highstale = toidx;
-                               else {
-                                       if (lfloghigh == -1)
-                                               lfloghigh = toidx;
-                                       continue;
-                               }
-                       }
-                       if (fromidx < toidx)
-                               blp[toidx] = blp[fromidx];
-                       toidx--;
-               }
-               lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
-               lfloghigh -= be32_to_cpu(btp->stale) - 1;
-               be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
-               xfs_dir2_data_make_free(tp, bp,
-                       (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
-                       (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
-                       &needlog, &needscan);
-               blp += be32_to_cpu(btp->stale) - 1;
-               btp->stale = cpu_to_be32(1);
-               /*
-                * If we now need to rebuild the bestfree map, do so.
-                * This needs to happen before the next call to use_free.
-                */
-               if (needscan) {
-                       xfs_dir2_data_freescan(mp, hdr, &needlog);
-                       needscan = 0;
-               }
-       }
-       /*
-        * Set leaf logging boundaries to impossible state.
-        * For the no-stale case they're set explicitly.
         */
+       if (compact)
+               xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+                                     &lfloghigh, &lfloglow);
        else if (btp->stale) {
+               /*
+                * Set leaf logging boundaries to impossible state.
+                * For the no-stale case they're set explicitly.
+                */
                lfloglow = be32_to_cpu(btp->count);
                lfloghigh = -1;
        }
+
        /*
         * Find the slot that's first lower than our hash value, -1 if none.
         */
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
        /*
         * If the block number in the offset is out of range, we're done.
         */
-       if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) {
+       if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
                return 0;
-       }
-       /*
-        * Can't read the block, give up, else get dabuf in bp.
-        */
-       error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
-                               &bp, XFS_DATA_FORK);
+
+       error = xfs_dir2_block_read(NULL, dp, &bp);
        if (error)
                return error;
 
-       ASSERT(bp != NULL);
        /*
         * Extract the byte offset we start at from the seek pointer.
         * We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
-       /*
-        * Read the buffer, return error if we can't get it.
-        */
-       if ((error =
-           xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+       error = xfs_dir2_block_read(tp, dp, &bp);
+       if (error)
                return error;
-       }
-       ASSERT(bp != NULL);
+
        hdr = bp->b_addr;
        xfs_dir2_data_check(dp, bp);
        btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
        /*
         * Read the data block if we don't already have it, give up if it fails.
         */
-       if (dbp == NULL &&
-           (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
-                   XFS_DATA_FORK))) {
-               return error;
+       if (!dbp) {
+               error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+               if (error)
+                       return error;
        }
        hdr = dbp->b_addr;
        ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
        /*
         * Start converting it to block form.
         */
+       dbp->b_ops = &xfs_dir2_block_buf_ops;
        hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
        needlog = 1;
        needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
                kmem_free(sfp);
                return error;
        }
+       bp->b_ops = &xfs_dir2_block_buf_ops;
        hdr = bp->b_addr;
        hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
        /*
index 44ffd4d6bc91af0f0f738bcf38f1371b2185d684..ffcf1774152eba2ebc510cf338514a0c0408af6a 100644 (file)
 STATIC xfs_dir2_data_free_t *
 xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
 
-#ifdef DEBUG
 /*
  * Check the consistency of the data block.
  * The input can also be a block-format directory.
- * Pop an assert if we find anything bad.
+ * Return 0 if the buffer is good, otherwise an error.
  */
-void
-xfs_dir2_data_check(
+int
+__xfs_dir2_data_check(
        struct xfs_inode        *dp,            /* incore inode pointer */
        struct xfs_buf          *bp)            /* data block's buffer */
 {
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
        int                     stale;          /* count of stale leaves */
        struct xfs_name         name;
 
-       mp = dp->i_mount;
+       mp = bp->b_target->bt_mount;
        hdr = bp->b_addr;
        bf = hdr->bestfree;
        p = (char *)(hdr + 1);
 
-       if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+       switch (hdr->magic) {
+       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
                btp = xfs_dir2_block_tail_p(mp, hdr);
                lep = xfs_dir2_block_leaf_p(btp);
                endp = (char *)lep;
-       } else {
-               ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+               break;
+       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
                endp = (char *)hdr + mp->m_dirblksize;
+               break;
+       default:
+               XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+               return EFSCORRUPTED;
        }
 
        count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
         * Account for zero bestfree entries.
         */
        if (!bf[0].length) {
-               ASSERT(!bf[0].offset);
+               XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
                freeseen |= 1 << 0;
        }
        if (!bf[1].length) {
-               ASSERT(!bf[1].offset);
+               XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
                freeseen |= 1 << 1;
        }
        if (!bf[2].length) {
-               ASSERT(!bf[2].offset);
+               XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
                freeseen |= 1 << 2;
        }
-       ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length));
-       ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length));
+
+       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+                                               be16_to_cpu(bf[1].length));
+       XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+                                               be16_to_cpu(bf[2].length));
        /*
         * Loop over the data/unused entries.
         */
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
                 * doesn't need to be there.
                 */
                if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
-                       ASSERT(lastfree == 0);
-                       ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
-                              (char *)dup - (char *)hdr);
+                       XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+                       XFS_WANT_CORRUPTED_RETURN(
+                               be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+                                              (char *)dup - (char *)hdr);
                        dfp = xfs_dir2_data_freefind(hdr, dup);
                        if (dfp) {
                                i = (int)(dfp - bf);
-                               ASSERT((freeseen & (1 << i)) == 0);
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       (freeseen & (1 << i)) == 0);
                                freeseen |= 1 << i;
                        } else {
-                               ASSERT(be16_to_cpu(dup->length) <=
-                                      be16_to_cpu(bf[2].length));
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       be16_to_cpu(dup->length) <=
+                                               be16_to_cpu(bf[2].length));
                        }
                        p += be16_to_cpu(dup->length);
                        lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
                 * The linear search is crude but this is DEBUG code.
                 */
                dep = (xfs_dir2_data_entry_t *)p;
-               ASSERT(dep->namelen != 0);
-               ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
-               ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
-                      (char *)dep - (char *)hdr);
+               XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+               XFS_WANT_CORRUPTED_RETURN(
+                       !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+               XFS_WANT_CORRUPTED_RETURN(
+                       be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+                                              (char *)dep - (char *)hdr);
                count++;
                lastfree = 0;
                if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
                                    be32_to_cpu(lep[i].hashval) == hash)
                                        break;
                        }
-                       ASSERT(i < be32_to_cpu(btp->count));
+                       XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
                }
                p += xfs_dir2_data_entsize(dep->namelen);
        }
        /*
         * Need to have seen all the entries and all the bestfree slots.
         */
-       ASSERT(freeseen == 7);
+       XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
        if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
                for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
                        if (lep[i].address ==
                            cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
                                stale++;
                        if (i > 0)
-                               ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval));
+                               XFS_WANT_CORRUPTED_RETURN(
+                                       be32_to_cpu(lep[i].hashval) >=
+                                               be32_to_cpu(lep[i - 1].hashval));
                }
-               ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
-               ASSERT(stale == be32_to_cpu(btp->stale));
+               XFS_WANT_CORRUPTED_RETURN(count ==
+                       be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+               XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
        }
+       return 0;
+}
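
The hunk above converts the DEBUG-only ASSERT()s in __xfs_dir2_data_check() into
XFS_WANT_CORRUPTED_RETURN() checks, so a failed consistency test now reports the
corruption and returns EFSCORRUPTED to the caller instead of only firing on debug
kernels. A rough userspace sketch of that pattern (the macro below is a stand-in,
not XFS's actual definition):

#include <stdio.h>

#define EFSCORRUPTED	117	/* same value as the Linux errno */

/* Stand-in for XFS_WANT_CORRUPTED_RETURN(): report and bail out with
 * EFSCORRUPTED instead of asserting. */
#define WANT_CORRUPTED_RETURN(expr)				\
	do {							\
		if (!(expr)) {					\
			fprintf(stderr, "corrupt: %s\n", #expr);\
			return EFSCORRUPTED;			\
		}						\
	} while (0)

/* The bestfree lengths must be sorted largest to smallest. */
static int check_bestfree(unsigned short l0, unsigned short l1,
			  unsigned short l2)
{
	WANT_CORRUPTED_RETURN(l0 >= l1);
	WANT_CORRUPTED_RETURN(l1 >= l2);
	return 0;
}

int main(void)
{
	printf("good=%d bad=%d\n",
	       check_bestfree(8, 4, 2), check_bestfree(4, 8, 2));
	return 0;
}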
+
+static void
+xfs_dir2_data_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+       block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir2_data_reada_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+       switch (hdr->magic) {
+       case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+               bp->b_ops = &xfs_dir2_block_buf_ops;
+               bp->b_ops->verify_read(bp);
+               return;
+       case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+               xfs_dir2_data_verify(bp);
+               return;
+       default:
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+               break;
+       }
+}
+
+static void
+xfs_dir2_data_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_data_verify(bp);
+}
+
+static void
+xfs_dir2_data_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_data_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
+       .verify_read = xfs_dir2_data_read_verify,
+       .verify_write = xfs_dir2_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
+       .verify_read = xfs_dir2_data_reada_verify,
+       .verify_write = xfs_dir2_data_write_verify,
+};
+
+
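These ops structures are the core of the verifier series: instead of each call
site open-coding magic checks after I/O, the buffer carries a pointer to the
verifiers for its expected format and the I/O completion path invokes them.
A minimal userspace model of that dispatch, with invented names and magic
values (this is not the kernel's struct xfs_buf_ops):

#include <stdio.h>

struct buf;

struct buf_ops {
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};

struct buf {
	const struct buf_ops	*ops;
	unsigned int		magic;
	int			io_error;
};

static void data_verify(struct buf *bp)
{
	if (bp->magic != 0x58443244)	/* made-up on-disk magic */
		bp->io_error = 117;	/* EFSCORRUPTED */
}

static const struct buf_ops data_buf_ops = {
	.verify_read	= data_verify,
	.verify_write	= data_verify,
};

int main(void)
{
	struct buf bp = { .ops = &data_buf_ops, .magic = 0 };

	/* I/O completion: run the attached read verifier, if any. */
	if (bp.ops && bp.ops->verify_read)
		bp.ops->verify_read(&bp);
	printf("io_error=%d\n", bp.io_error);
	return 0;
}

Note how the readahead variant above pairs the format-agnostic read verifier
with the normal write verifier: a buffer first populated by readahead can
still be dirtied and written back later.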
+int
+xfs_dir2_data_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mapped_bno,
+       struct xfs_buf          **bpp)
+{
+       return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+                               XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+}
+
+int
+xfs_dir2_data_readahead(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             bno,
+       xfs_daddr_t             mapped_bno)
+{
+       return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+                               XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
 }
-#endif
 
 /*
  * Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
         */
        error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
                XFS_DATA_FORK);
-       if (error) {
+       if (error)
                return error;
-       }
-       ASSERT(bp != NULL);
+       bp->b_ops = &xfs_dir2_data_buf_ops;
 
        /*
         * Initialize the header.
index 0b296253bd018d450f5773d6df3bdbcc03d55db5..60cd2fa4e047b3266b4abffa935663788b1d0388 100644 (file)
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
                                    int first, int last);
 static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
 
+static void
+xfs_dir2_leaf_verify(
+       struct xfs_buf          *bp,
+       __be16                  magic)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->info.magic == magic;
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_dir2_leaf1_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+static void
+xfs_dir2_leaf1_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+void
+xfs_dir2_leafn_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+void
+xfs_dir2_leafn_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
+       .verify_read = xfs_dir2_leaf1_read_verify,
+       .verify_write = xfs_dir2_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
+       .verify_read = xfs_dir2_leafn_read_verify,
+       .verify_write = xfs_dir2_leafn_write_verify,
+};
+
+static int
+xfs_dir2_leaf_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+}
+
+int
+xfs_dir2_leafn_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+}
 
 /*
  * Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
        /*
         * Fix up the block header, make it a data block.
         */
+       dbp->b_ops = &xfs_dir2_data_buf_ops;
        hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
        if (needscan)
                xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
-       /*
-        * Read the leaf block.
-        */
-       error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-               XFS_DATA_FORK);
-       if (error) {
+
+       error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+       if (error)
                return error;
-       }
-       ASSERT(lbp != NULL);
+
        /*
         * Look up the entry by hash value and name.
         * We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
                hdr = dbp->b_addr;
                bestsp[use_block] = hdr->bestfree[0].length;
                grown = 1;
-       }
-       /*
-        * Already had space in some data block.
-        * Just read that one in.
-        */
-       else {
-               if ((error =
-                   xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
-                           -1, &dbp, XFS_DATA_FORK))) {
+       } else {
+               /*
+                * Already had space in some data block.
+                * Just read that one in.
+                */
+               error = xfs_dir2_data_read(tp, dp,
+                                          xfs_dir2_db_to_da(mp, use_block),
+                                          -1, &dbp);
+               if (error) {
                        xfs_trans_brelse(tp, lbp);
                        return error;
                }
                hdr = dbp->b_addr;
                grown = 0;
        }
-       xfs_dir2_data_check(dp, dbp);
        /*
         * Point to the biggest freespace in our data block.
         */
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
         * Read the directory block starting at the first mapping.
         */
        mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
-       error = xfs_da_read_buf(NULL, dp, map->br_startoff,
+       error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
                        map->br_blockcount >= mp->m_dirblkfsbs ?
-                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1,
-                       &bp, XFS_DATA_FORK);
+                           XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
 
        /*
         * Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
                 */
                if (i > mip->ra_current &&
                    map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
-                       xfs_buf_readahead(mp->m_ddev_targp,
+                       xfs_dir2_data_readahead(NULL, dp,
+                               map[mip->ra_index].br_startoff + mip->ra_offset,
                                XFS_FSB_TO_DADDR(mp,
                                        map[mip->ra_index].br_startblock +
-                                                       mip->ra_offset),
-                               (int)BTOBB(mp->m_dirblksize));
+                                                       mip->ra_offset));
                        mip->ra_current = i;
                }
 
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
                 * use our mapping, but this is a very rare case.
                 */
                else if (i > mip->ra_current) {
-                       xfs_da_reada_buf(NULL, dp,
+                       xfs_dir2_data_readahead(NULL, dp,
                                        map[mip->ra_index].br_startoff +
-                                                       mip->ra_offset,
-                                       XFS_DATA_FORK);
+                                                       mip->ra_offset, -1);
                        mip->ra_current = i;
                }
 
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
         * Get the buffer for the block.
         */
        error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
-               XFS_DATA_FORK);
-       if (error) {
+                              XFS_DATA_FORK);
+       if (error)
                return error;
-       }
-       ASSERT(bp != NULL);
-       leaf = bp->b_addr;
+
        /*
         * Initialize the header.
         */
+       leaf = bp->b_addr;
        leaf->hdr.info.magic = cpu_to_be16(magic);
        leaf->hdr.info.forw = 0;
        leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
         * the block.
         */
        if (magic == XFS_DIR2_LEAF1_MAGIC) {
+               bp->b_ops = &xfs_dir2_leaf1_buf_ops;
                ltp = xfs_dir2_leaf_tail_p(mp, leaf);
                ltp->bestcount = 0;
                xfs_dir2_leaf_log_tail(tp, bp);
-       }
+       } else {
+               bp->b_ops = &xfs_dir2_leafn_buf_ops;
+       }
        *bpp = bp;
        return 0;
 }
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
        dp = args->dp;
        tp = args->trans;
        mp = dp->i_mount;
-       /*
-        * Read the leaf block into the buffer.
-        */
-       error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
-                                                       XFS_DATA_FORK);
+
+       error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
        if (error)
                return error;
+
        *lbpp = lbp;
        leaf = lbp->b_addr;
        xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
                if (newdb != curdb) {
                        if (dbp)
                                xfs_trans_brelse(tp, dbp);
-                       error = xfs_da_read_buf(tp, dp,
-                                               xfs_dir2_db_to_da(mp, newdb),
-                                               -1, &dbp, XFS_DATA_FORK);
+                       error = xfs_dir2_data_read(tp, dp,
+                                                  xfs_dir2_db_to_da(mp, newdb),
+                                                  -1, &dbp);
                        if (error) {
                                xfs_trans_brelse(tp, lbp);
                                return error;
                        }
-                       xfs_dir2_data_check(dp, dbp);
                        curdb = newdb;
                }
                /*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
                ASSERT(cidb != -1);
                if (cidb != curdb) {
                        xfs_trans_brelse(tp, dbp);
-                       error = xfs_da_read_buf(tp, dp,
-                                               xfs_dir2_db_to_da(mp, cidb),
-                                               -1, &dbp, XFS_DATA_FORK);
+                       error = xfs_dir2_data_read(tp, dp,
+                                                  xfs_dir2_db_to_da(mp, cidb),
+                                                  -1, &dbp);
                        if (error) {
                                xfs_trans_brelse(tp, lbp);
                                return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
        /*
         * Read the offending data block.  We need its buffer.
         */
-       if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
-                       XFS_DATA_FORK))) {
+       error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+       if (error)
                return error;
-       }
 
        leaf = lbp->b_addr;
        ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
        /*
         * Read the freespace block.
         */
-       if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
-                       XFS_DATA_FORK))) {
+       error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+       if (error)
                return error;
-       }
        free = fbp->b_addr;
        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
        ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
                xfs_dir2_leaf_compact(args, lbp);
        else
                xfs_dir2_leaf_log_header(tp, lbp);
+
+       lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
        leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+
        /*
         * Set up the leaf tail from the freespace block.
         */
index 6c70524066051381f823d1043b857b5421a6490e..5980f9b7fa9b8c9bf69d2bf0b21408001e99e46a 100644 (file)
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
 static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
                                     xfs_da_state_blk_t *fblk);
 
+static void
+xfs_dir2_free_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+       int                     block_ok = 0;
+
+       block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+       if (!block_ok) {
+               XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
+                                    XFS_ERRLEVEL_LOW, mp, hdr);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_dir2_free_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_free_verify(bp);
+}
+
+static void
+xfs_dir2_free_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dir2_free_verify(bp);
+}
+
+static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
+       .verify_read = xfs_dir2_free_read_verify,
+       .verify_write = xfs_dir2_free_write_verify,
+};
+
+
+static int
+__xfs_dir2_free_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       xfs_daddr_t             mappedbno,
+       struct xfs_buf          **bpp)
+{
+       return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+                               XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+}
+
+int
+xfs_dir2_free_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       struct xfs_buf          **bpp)
+{
+       return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+       struct xfs_trans        *tp,
+       struct xfs_inode        *dp,
+       xfs_dablk_t             fbno,
+       struct xfs_buf          **bpp)
+{
+       return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+}
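
The only difference between the wrappers is the mappedbno argument: -1
requests a normal read of a block that must exist, while -2 evidently asks
xfs_da_read_buf() to tolerate a hole and hand back a NULL buffer, which is
why callers of the try variant (xfs_dir2_node_addname_int and
xfs_dir2_node_trim_free below) test *bpp for NULL rather than an error.
A toy model of that convention, names invented:

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct xbuf { int id; };

/* Toy read_buf(): mappedbno -2 turns "block is a hole" into a NULL
 * buffer; mappedbno -1 makes the same hole a hard error. */
static int read_buf(long mappedbno, int is_hole, struct xbuf **bpp)
{
	static struct xbuf b = { 1 };

	if (is_hole) {
		*bpp = NULL;
		return mappedbno == -2 ? 0 : ENOENT;
	}
	*bpp = &b;
	return 0;
}

int main(void)
{
	struct xbuf *bp;
	int error;

	error = read_buf(-2, 1, &bp);	/* try-read over a hole */
	printf("try:  error=%d bp=%p\n", error, (void *)bp);

	error = read_buf(-1, 1, &bp);	/* plain read of the hole */
	printf("read: error=%d\n", error);
	return 0;
}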
+
 /*
  * Log entries from a freespace block.
  */
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
        /*
         * Get the buffer for the new freespace block.
         */
-       if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
-                       XFS_DATA_FORK))) {
+       error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+                               XFS_DATA_FORK);
+       if (error)
                return error;
-       }
-       ASSERT(fbp != NULL);
+       fbp->b_ops = &xfs_dir2_free_buf_ops;
+
        free = fbp->b_addr;
        leaf = lbp->b_addr;
        ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
                *to = cpu_to_be16(off);
        }
        free->hdr.nused = cpu_to_be32(n);
+
+       lbp->b_ops = &xfs_dir2_leafn_buf_ops;
        leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
        /*
         * Log everything.
         */
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
                                 */
                                if (curbp)
                                        xfs_trans_brelse(tp, curbp);
-                               /*
-                                * Read the free block.
-                                */
-                               error = xfs_da_read_buf(tp, dp,
+
+                               error = xfs_dir2_free_read(tp, dp,
                                                xfs_dir2_db_to_da(mp, newfdb),
-                                               -1, &curbp, XFS_DATA_FORK);
+                                               &curbp);
                                if (error)
                                        return error;
                                free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
                                ASSERT(state->extravalid);
                                curbp = state->extrablk.bp;
                        } else {
-                               error = xfs_da_read_buf(tp, dp,
+                               error = xfs_dir2_data_read(tp, dp,
                                                xfs_dir2_db_to_da(mp, newdb),
-                                               -1, &curbp, XFS_DATA_FORK);
+                                               -1, &curbp);
                                if (error)
                                        return error;
                        }
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
                        state->extrablk.index = (int)((char *)dep -
                                                        (char *)curbp->b_addr);
                        state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                       curbp->b_ops = &xfs_dir2_data_buf_ops;
                        if (cmp == XFS_CMP_EXACT)
                                return XFS_ERROR(EEXIST);
                }
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
                        state->extrablk.index = -1;
                        state->extrablk.blkno = curdb;
                        state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+                       curbp->b_ops = &xfs_dir2_data_buf_ops;
                } else {
                        /* If the curbp is not the CI match block, drop it */
                        if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
        }
 }
 
+static int
+xfs_dir2_data_block_free(
+       xfs_da_args_t           *args,
+       struct xfs_dir2_data_hdr *hdr,
+       struct xfs_dir2_free    *free,
+       xfs_dir2_db_t           fdb,
+       int                     findex,
+       struct xfs_buf          *fbp,
+       int                     longest)
+{
+       struct xfs_trans        *tp = args->trans;
+       int                     logfree = 0;
+
+       if (!hdr) {
+               /* One less used entry in the free table.  */
+               be32_add_cpu(&free->hdr.nused, -1);
+               xfs_dir2_free_log_header(tp, fbp);
+
+               /*
+                * If this was the last entry in the table, we can trim the
+                * table size back.  There might be other entries at the end
+                * referring to non-existent data blocks, get those too.
+                */
+               if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+                       int     i;              /* free entry index */
+
+                       for (i = findex - 1; i >= 0; i--) {
+                               if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
+                                       break;
+                       }
+                       free->hdr.nvalid = cpu_to_be32(i + 1);
+                       logfree = 0;
+               } else {
+                       /* Not the last entry, just punch it out.  */
+                       free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+                       logfree = 1;
+               }
+               /*
+                * If there are no useful entries left in the block,
+                * get rid of the block if we can.
+                */
+               if (!free->hdr.nused) {
+                       int error;
+
+                       error = xfs_dir2_shrink_inode(args, fdb, fbp);
+                       if (error == 0) {
+                               fbp = NULL;
+                               logfree = 0;
+                       } else if (error != ENOSPC || args->total != 0)
+                               return error;
+                       /*
+                        * It's possible to get ENOSPC if there is no
+                        * space reservation.  In this case someone
+                        * else will eventually get rid of this block.
+                        */
+               }
+       } else {
+               /*
+                * Data block is not empty, just set the free entry to the new
+                * value.
+                */
+               free->bests[findex] = cpu_to_be16(longest);
+               logfree = 1;
+       }
+
+       /* Log the free entry that changed, unless we got rid of it.  */
+       if (logfree)
+               xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+       return 0;
+}
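
xfs_dir2_data_block_free() is a straight extraction of the free-index
bookkeeping that used to live inline in xfs_dir2_leafn_remove() (the old
copy is deleted further down). The one subtle step is the nvalid trim:
when the freed entry was the last valid one, the table shrinks back past
any trailing NULLDATAOFF entries as well. A small worked example of that
backward scan:

#include <stdio.h>

#define NULLOFF	0xffffU		/* stand-in for NULLDATAOFF */

int main(void)
{
	unsigned short bests[] = { 100, NULLOFF, NULLOFF, NULLOFF };
	int findex = 3;		/* last valid index, now being freed */
	int i;

	/* Scan back past the trailing "no data block" entries. */
	for (i = findex - 1; i >= 0; i--) {
		if (bests[i] != NULLOFF)
			break;
	}
	printf("new nvalid = %d\n", i + 1);	/* prints 1 */
	return 0;
}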
+
 /*
  * Remove an entry from a node directory.
  * This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
                xfs_dir2_db_t   fdb;            /* freeblock block number */
                int             findex;         /* index in freeblock entries */
                xfs_dir2_free_t *free;          /* freeblock structure */
-               int             logfree;        /* need to log free entry */
 
                /*
                 * Convert the data block number to a free block,
                 * read in the free block.
                 */
                fdb = xfs_dir2_db_to_fdb(mp, db);
-               if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
-                               -1, &fbp, XFS_DATA_FORK))) {
+               error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+                                          &fbp);
+               if (error)
                        return error;
-               }
                free = fbp->b_addr;
                ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
                ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
                 * If we got rid of the data block, we can eliminate that entry
                 * in the free block.
                 */
-               if (hdr == NULL) {
-                       /*
-                        * One less used entry in the free table.
-                        */
-                       be32_add_cpu(&free->hdr.nused, -1);
-                       xfs_dir2_free_log_header(tp, fbp);
-                       /*
-                        * If this was the last entry in the table, we can
-                        * trim the table size back.  There might be other
-                        * entries at the end referring to non-existent
-                        * data blocks, get those too.
-                        */
-                       if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
-                               int     i;              /* free entry index */
-
-                               for (i = findex - 1;
-                                    i >= 0 &&
-                                    free->bests[i] == cpu_to_be16(NULLDATAOFF);
-                                    i--)
-                                       continue;
-                               free->hdr.nvalid = cpu_to_be32(i + 1);
-                               logfree = 0;
-                       }
-                       /*
-                        * Not the last entry, just punch it out.
-                        */
-                       else {
-                               free->bests[findex] = cpu_to_be16(NULLDATAOFF);
-                               logfree = 1;
-                       }
-                       /*
-                        * If there are no useful entries left in the block,
-                        * get rid of the block if we can.
-                        */
-                       if (!free->hdr.nused) {
-                               error = xfs_dir2_shrink_inode(args, fdb, fbp);
-                               if (error == 0) {
-                                       fbp = NULL;
-                                       logfree = 0;
-                               } else if (error != ENOSPC || args->total != 0)
-                                       return error;
-                               /*
-                                * It's possible to get ENOSPC if there is no
-                                * space reservation.  In this case some one
-                                * else will eventually get rid of this block.
-                                */
-                       }
-               }
-               /*
-                * Data block is not empty, just set the free entry to
-                * the new value.
-                */
-               else {
-                       free->bests[findex] = cpu_to_be16(longest);
-                       logfree = 1;
-               }
-               /*
-                * Log the free entry that changed, unless we got rid of it.
-                */
-               if (logfree)
-                       xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+               error = xfs_dir2_data_block_free(args, hdr, free,
+                                                fdb, findex, fbp, longest);
+               if (error)
+                       return error;
        }
+
        xfs_dir2_leafn_check(dp, bp);
        /*
         * Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
                /*
                 * Read the sibling leaf block.
                 */
-               if ((error =
-                   xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
-                           -1, &bp, XFS_DATA_FORK))) {
+               error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+                                           blkno, -1, &bp);
+               if (error)
                        return error;
-               }
-               ASSERT(bp != NULL);
+
                /*
                 * Count bytes in the two blocks combined.
                 */
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
                         * This should be really rare, so there's no reason
                         * to avoid it.
                         */
-                       if ((error = xfs_da_read_buf(tp, dp,
-                                       xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-                                       XFS_DATA_FORK))) {
+                       error = xfs_dir2_free_try_read(tp, dp,
+                                               xfs_dir2_db_to_da(mp, fbno),
+                                               &fbp);
+                       if (error)
                                return error;
-                       }
-                       if (unlikely(fbp == NULL)) {
+                       if (!fbp)
                                continue;
-                       }
                        free = fbp->b_addr;
                        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
                        findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
                 * that was just allocated.
                 */
                fbno = xfs_dir2_db_to_fdb(mp, dbno);
-               if (unlikely(error = xfs_da_read_buf(tp, dp,
-                               xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
-                               XFS_DATA_FORK)))
+               error = xfs_dir2_free_try_read(tp, dp,
+                                              xfs_dir2_db_to_da(mp, fbno),
+                                              &fbp);
+               if (error)
                        return error;
 
                /*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
                        /*
                         * Get a buffer for the new block.
                         */
-                       if ((error = xfs_da_get_buf(tp, dp,
-                                                  xfs_dir2_db_to_da(mp, fbno),
-                                                  -1, &fbp, XFS_DATA_FORK))) {
+                       error = xfs_da_get_buf(tp, dp,
+                                              xfs_dir2_db_to_da(mp, fbno),
+                                              -1, &fbp, XFS_DATA_FORK);
+                       if (error)
                                return error;
-                       }
-                       ASSERT(fbp != NULL);
+                       fbp->b_ops = &xfs_dir2_free_buf_ops;
 
                        /*
                         * Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
                /*
                 * Read the data block in.
                 */
-               error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
-                               -1, &dbp, XFS_DATA_FORK);
+               error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+                                          -1, &dbp);
                if (error)
                        return error;
                hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
        /*
         * Read the freespace block.
         */
-       if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
-                       XFS_DATA_FORK))) {
+       error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+       if (error)
                return error;
-       }
-
        /*
         * There can be holes in freespace.  If fo is a hole, there's
         * nothing to do.
         */
-       if (bp == NULL) {
+       if (!bp)
                return 0;
-       }
        free = bp->b_addr;
        ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
        /*
index 3523d3e15aa8ad3fd3f86aaf65f322ad13c956bf..7da79f6515fd08e94c036d6010cc9b8d1d68e1f8 100644 (file)
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
                                const unsigned char *name, int len);
 
 /* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
 extern int xfs_dir2_block_addname(struct xfs_da_args *args);
 extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
                xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
 
 /* xfs_dir2_data.c */
 #ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define        xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp)
 #else
 #define        xfs_dir2_data_check(dp,bp)
 #endif
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+
 extern struct xfs_dir2_data_free *
 xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
                struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
                xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
 
 /* xfs_dir2_leaf.c */
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
 extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
                struct xfs_buf *dbp);
 extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
 extern int xfs_dir2_node_replace(struct xfs_da_args *args);
 extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
                int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+               xfs_dablk_t fbno, struct xfs_buf **bpp);
 
 /* xfs_dir2_sf.c */
 extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
index bf27fcca4843480376b097cbab1bc2e4f6c5e261..9e1bf5294c914a92dc12e2b31d9ab54644ae9a59 100644 (file)
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
        xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
 
+static void
+xfs_dquot_buf_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_dqblk        *d = (struct xfs_dqblk *)bp->b_addr;
+       struct xfs_disk_dquot   *ddq;
+       xfs_dqid_t              id = 0;
+       int                     i;
+
+       /*
+        * On the first read of the buffer, verify that each dquot is valid.
+        * We don't know what the id of the dquot is supposed to be, just that
+        * they should be increasing monotonically within the buffer. If the
+        * first id is corrupt, then it will fail on the second dquot in the
+        * buffer so corruptions could point to the wrong dquot in this case.
+        */
+       for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+               int     error;
+
+               ddq = &d[i].dd_diskdq;
+
+               if (i == 0)
+                       id = be32_to_cpu(ddq->d_id);
+
+               error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+                                       "xfs_dquot_read_verify");
+               if (error) {
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);
+                       break;
+               }
+       }
+}
+
+static void
+xfs_dquot_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dquot_buf_verify(bp);
+}
+
+void
+xfs_dquot_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_dquot_buf_verify(bp);
+}
 
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+       .verify_read = xfs_dquot_buf_read_verify,
+       .verify_write = xfs_dquot_buf_write_verify,
+};
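
Because the verifier runs once per buffer rather than once per dquot, it
sweeps the whole chunk in a single pass, deriving each dquot's expected id
from the first id in the buffer. A toy version of that monotonic-id walk
(array contents invented):

#include <stdio.h>

int main(void)
{
	unsigned int ids[] = { 240, 241, 242, 245 };	/* 245 is wrong */
	unsigned int id = ids[0];	/* id taken from dquot 0 */
	unsigned int i;

	for (i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		if (ids[i] != id + i) {
			printf("dquot %u: id %u, expected %u\n",
			       i, ids[i], id + i);
			return 1;
		}
	}
	puts("chunk ok");
	return 0;
}

As the comment in xfs_dquot_buf_verify() notes, if ids[0] itself were the
corrupt one, the mismatch would surface at index 1, so the reported dquot
can be off in that case.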
 
 /*
  * Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
        error = xfs_buf_geterror(bp);
        if (error)
                goto error1;
+       bp->b_ops = &xfs_dquot_buf_ops;
 
        /*
         * Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
 
        return (error);
 }
+
+STATIC int
+xfs_qm_dqrepair(
+       struct xfs_mount        *mp,
+       struct xfs_trans        *tp,
+       struct xfs_dquot        *dqp,
+       xfs_dqid_t              firstid,
+       struct xfs_buf          **bpp)
+{
+       int                     error;
+       struct xfs_disk_dquot   *ddq;
+       struct xfs_dqblk        *d;
+       int                     i;
+
+       /*
+        * Read the buffer without verification so we get the corrupted
+        * buffer returned to us. Make sure we verify it on write, though.
+        */
+       error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+                                  mp->m_quotainfo->qi_dqchunklen,
+                                  0, bpp, NULL);
+
+       if (error) {
+               ASSERT(*bpp == NULL);
+               return XFS_ERROR(error);
+       }
+       (*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+       ASSERT(xfs_buf_islocked(*bpp));
+       d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+       /* Do the actual repair of dquots in this buffer */
+       for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+               ddq = &d[i].dd_diskdq;
+               error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+                                      dqp->dq_flags & XFS_DQ_ALLTYPES,
+                                      XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+               if (error) {
+                       /* repair failed, we're screwed */
+                       xfs_trans_brelse(tp, *bpp);
+                       return XFS_ERROR(EIO);
+               }
+       }
+
+       return 0;
+}
 
 /*
  * Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
        xfs_buf_t       *bp;
        xfs_inode_t     *quotip = XFS_DQ_TO_QIP(dqp);
        xfs_mount_t     *mp = dqp->q_mount;
-       xfs_disk_dquot_t *ddq;
        xfs_dqid_t      id = be32_to_cpu(dqp->q_core.d_id);
        xfs_trans_t     *tp = (tpp ? *tpp : NULL);
 
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
                error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                                           dqp->q_blkno,
                                           mp->m_quotainfo->qi_dqchunklen,
-                                          0, &bp);
-               if (error || !bp)
-                       return XFS_ERROR(error);
-       }
-
-       ASSERT(xfs_buf_islocked(bp));
+                                          0, &bp, &xfs_dquot_buf_ops);
 
-       /*
-        * calculate the location of the dquot inside the buffer.
-        */
-       ddq = bp->b_addr + dqp->q_bufoffset;
+               if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+                       xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+                                               mp->m_quotainfo->qi_dqperchunk;
+                       ASSERT(bp == NULL);
+                       error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+               }
 
-       /*
-        * A simple sanity check in case we got a corrupted dquot...
-        */
-       error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
-                          flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
-                          "dqtobp");
-       if (error) {
-               if (!(flags & XFS_QMOPT_DQREPAIR)) {
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(EIO);
+               if (error) {
+                       ASSERT(bp == NULL);
+                       return XFS_ERROR(error);
                }
        }
 
+       ASSERT(xfs_buf_islocked(bp));
        *O_bpp = bp;
-       *O_ddpp = ddq;
+       *O_ddpp = bp->b_addr + dqp->q_bufoffset;
 
        return (0);
 }
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
         * Get the buffer containing the on-disk dquot
         */
        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
-                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+                                  mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
        if (error)
                goto out_unlock;
 
index 7d20af27346dcae9d7015c02c7fd4d03fdb09948..c694a8469c4a0b6567fc67c64841a7b97ffd0779 100644 (file)
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
        return dqp;
 }
 
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
 #endif /* __XFS_DQUOT_H__ */
index 8c6d1d70278cc6e71dbc093cbbfc5f2c5d44df19..a83611849cee292d6731d8a782778d6ebb2bf048 100644 (file)
@@ -29,6 +29,7 @@
 #include "xfs_inode.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * Note that we only accept fileids which are long enough rather than allow
index aa473fa640a2dd16f56ae76abfb72c1f204ba43b..67284edb84d74d2156283c3c34e1b28d179b4861 100644 (file)
@@ -31,6 +31,8 @@
 #include "xfs_error.h"
 #include "xfs_vnodeops.h"
 #include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
 #include "xfs_ioctl.h"
 #include "xfs_trace.h"
 
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
  *     valid before the operation, it will be read from disk before
  *     being partially zeroed.
  */
-STATIC int
+int
 xfs_iozero(
        struct xfs_inode        *ip,    /* inode                        */
        loff_t                  pos,    /* offset in file               */
@@ -255,15 +257,14 @@ xfs_file_aio_read(
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
-               if ((iocb->ki_pos & target->bt_smask) ||
-                   (size & target->bt_smask)) {
-                       if (iocb->ki_pos == i_size_read(inode))
+               if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+                       if (pos == i_size_read(inode))
                                return 0;
                        return -XFS_ERROR(EINVAL);
                }
        }
 
-       n = mp->m_super->s_maxbytes - iocb->ki_pos;
+       n = mp->m_super->s_maxbytes - pos;
        if (n <= 0 || size == 0)
                return 0;
 
@@ -289,20 +290,21 @@ xfs_file_aio_read(
                xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
                if (inode->i_mapping->nrpages) {
-                       ret = -xfs_flushinval_pages(ip,
-                                       (iocb->ki_pos & PAGE_CACHE_MASK),
-                                       -1, FI_REMAPF_LOCKED);
+                       ret = -filemap_write_and_wait_range(
+                                                       VFS_I(ip)->i_mapping,
+                                                       pos, -1);
                        if (ret) {
                                xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
                                return ret;
                        }
+                       truncate_pagecache_range(VFS_I(ip), pos, -1);
                }
                xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
        }
 
-       trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+       trace_xfs_file_read(ip, size, pos, ioflags);
 
-       ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+       ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
                goto out;
 
        if (mapping->nrpages) {
-               ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
-                                                       FI_REMAPF_LOCKED);
+               ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+                                                   pos, -1);
                if (ret)
                        goto out;
+               truncate_pagecache_range(VFS_I(ip), pos, -1);
        }
 
        /*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
 write_retry:
        trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
        ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-                       pos, &iocb->ki_pos, count, ret);
+                       pos, &iocb->ki_pos, count, 0);
+
        /*
-        * if we just got an ENOSPC, flush the inode now we aren't holding any
-        * page locks and retry *once*
+        * If we just got an ENOSPC, try to write back all dirty inodes to
+        * convert delalloc space to free up some of the excess reserved
+        * metadata space.
         */
        if (ret == -ENOSPC && !enospc) {
                enospc = 1;
-               ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-               if (!ret)
-                       goto write_retry;
+               xfs_flush_inodes(ip->i_mount);
+               goto write_retry;
        }
 
        current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
         */
        mode = xfs_ilock_map_shared(ip);
        if (ip->i_d.di_nextents > 0)
-               xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+               xfs_dir2_data_readahead(NULL, ip, 0, -1);
        xfs_iunlock(ip, mode);
        return 0;
 }
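
The hunks above replace xfs_flushinval_pages() with the same open-coded pair at both call sites: write back and wait on the byte range, then invalidate the page cache over it. Pulled out of context, the pattern is (a sketch; the -1 end offset meaning "to EOF" and the negated, positive-error return convention follow the hunks above):

static int
xfs_flushinval_sketch(          /* hypothetical name for the pattern */
        struct xfs_inode        *ip,
        xfs_off_t               pos)
{
        int                     error;

        /* write back and wait on all dirty pages from pos onwards */
        error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
                                              pos, -1);
        if (error)
                return error;

        /* the range is now clean, so toss it from the page cache */
        truncate_pagecache_range(VFS_I(ip), pos, -1);
        return 0;
}
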
index c13fed8c394afb2b261b640cb662c33051741694..6dda3f949b04f5c8a9ad804d030c9b21f99b4d96 100644 (file)
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_LOGV2      0x0100  /* log format version 2 */
 #define XFS_FSOP_GEOM_FLAGS_SECTOR     0x0200  /* sector sizes >1BB    */
 #define XFS_FSOP_GEOM_FLAGS_ATTR2      0x0400  /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI    0x1000  /* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32   0x0800  /* 32-bit project IDs   */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI    0x1000  /* ASCII only CI names  */
 #define XFS_FSOP_GEOM_FLAGS_LAZYSB     0x4000  /* lazy superblock counters */
 
 
@@ -338,6 +339,35 @@ typedef struct xfs_error_injection {
 } xfs_error_injection_t;
 
 
+/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION          1
+struct xfs_eofblocks {
+       __u32           eof_version;
+       __u32           eof_flags;
+       uid_t           eof_uid;
+       gid_t           eof_gid;
+       prid_t          eof_prid;
+       __u32           pad32;
+       __u64           eof_min_file_size;
+       __u64           pad64[12];
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC             (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID              (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID              (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID             (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE      (1 << 4) /* filter by min file size */
+#define XFS_EOF_FLAGS_VALID    \
+       (XFS_EOF_FLAGS_SYNC |   \
+        XFS_EOF_FLAGS_UID |    \
+        XFS_EOF_FLAGS_GID |    \
+        XFS_EOF_FLAGS_PRID |   \
+        XFS_EOF_FLAGS_MINFILESIZE)
+
+
 /*
  * The user-level Handle Request interface structure.
  */
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
 /*     XFS_IOC_GETBIOSIZE ---- deprecated 47      */
 #define XFS_IOC_GETBMAPX       _IOWR('X', 56, struct getbmap)
 #define XFS_IOC_ZERO_RANGE     _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
 
 /*
  * ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644 (file)
index 652b875..0000000
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       int             fiopt)
-{
-       /* can't toss partial tail pages, so mask them out */
-       last &= ~(PAGE_SIZE - 1);
-       truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       int             fiopt)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-       int             ret = 0;
-
-       trace_xfs_pagecache_inval(ip, first, last);
-
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-       ret = filemap_write_and_wait_range(mapping, first,
-                               last == -1 ? LLONG_MAX : last);
-       if (!ret)
-               truncate_inode_pages_range(mapping, first, last);
-       return -ret;
-}
-
-int
-xfs_flush_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last,
-       uint64_t        flags,
-       int             fiopt)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-       int             ret = 0;
-       int             ret2;
-
-       xfs_iflags_clear(ip, XFS_ITRUNCATED);
-       ret = -filemap_fdatawrite_range(mapping, first,
-                               last == -1 ? LLONG_MAX : last);
-       if (flags & XBF_ASYNC)
-               return ret;
-       ret2 = xfs_wait_on_pages(ip, first, last);
-       if (!ret)
-               ret = ret2;
-       return ret;
-}
-
-int
-xfs_wait_on_pages(
-       xfs_inode_t     *ip,
-       xfs_off_t       first,
-       xfs_off_t       last)
-{
-       struct address_space *mapping = VFS_I(ip)->i_mapping;
-
-       if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
-               return -filemap_fdatawait_range(mapping, first,
-                                       last == -1 ? XFS_ISIZE(ip) - 1 : last);
-       }
-       return 0;
-}
index 4beaede43277b15c8d5a28ffbaadb4b573d7a709..94eaeedc54980b396147954dfee4ae335576a318 100644 (file)
@@ -97,7 +97,9 @@ xfs_fs_geometry(
                        (xfs_sb_version_haslazysbcount(&mp->m_sb) ?
                                XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
                        (xfs_sb_version_hasattr2(&mp->m_sb) ?
-                               XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+                               XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+                       (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+                               XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
                geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
                                mp->m_sb.sb_logsectsize : BBSIZE;
                geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
        return 0;
 }
 
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+       struct xfs_mount        *mp,
+       xfs_daddr_t             blkno,
+       size_t                  numblks,
+       int                     flags,
+       const struct xfs_buf_ops *ops)
+{
+       struct xfs_buf          *bp;
+
+       bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+       if (!bp)
+               return NULL;
+
+       xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+       bp->b_bn = blkno;
+       bp->b_maps[0].bm_bn = blkno;
+       bp->b_ops = ops;
+
+       return bp;
+}
+
 static int
 xfs_growfs_data_private(
        xfs_mount_t             *mp,            /* mount point for filesystem */
        xfs_growfs_data_t       *in)            /* growfs data input struct */
 {
        xfs_agf_t               *agf;
+       struct xfs_agfl         *agfl;
        xfs_agi_t               *agi;
        xfs_agnumber_t          agno;
        xfs_extlen_t            agsize;
        xfs_extlen_t            tmpsize;
        xfs_alloc_rec_t         *arec;
-       struct xfs_btree_block  *block;
        xfs_buf_t               *bp;
        int                     bucket;
        int                     dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
        dpct = pct - mp->m_sb.sb_imax_pct;
        bp = xfs_buf_read_uncached(mp->m_ddev_targp,
                                XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
-                               XFS_FSS_TO_BB(mp, 1), 0);
+                               XFS_FSS_TO_BB(mp, 1), 0, NULL);
        if (!bp)
                return EIO;
+       if (bp->b_error) {
+               int     error = bp->b_error;
+               xfs_buf_relse(bp);
+               return error;
+       }
        xfs_buf_relse(bp);
 
        new = nb;       /* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
        nfree = 0;
        for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
                /*
-                * AG freelist header block
+                * AG freespace header block
                 */
-               bp = xfs_buf_get(mp->m_ddev_targp,
-                                XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
-                                XFS_FSS_TO_BB(mp, 1), 0);
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+                               XFS_FSS_TO_BB(mp, 1), 0,
+                               &xfs_agf_buf_ops);
                if (!bp) {
                        error = ENOMEM;
                        goto error0;
                }
+
                agf = XFS_BUF_TO_AGF(bp);
-               memset(agf, 0, mp->m_sb.sb_sectsize);
                agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
                agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
                agf->agf_seqno = cpu_to_be32(agno);
@@ -222,18 +252,40 @@ xfs_growfs_data_private(
                if (error)
                        goto error0;
 
+               /*
+                * AG freelist header block
+                */
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+                               XFS_FSS_TO_BB(mp, 1), 0,
+                               &xfs_agfl_buf_ops);
+               if (!bp) {
+                       error = ENOMEM;
+                       goto error0;
+               }
+
+               agfl = XFS_BUF_TO_AGFL(bp);
+               for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+                       agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+               error = xfs_bwrite(bp);
+               xfs_buf_relse(bp);
+               if (error)
+                       goto error0;
+
                /*
                 * AG inode header block
                 */
-               bp = xfs_buf_get(mp->m_ddev_targp,
-                                XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                                XFS_FSS_TO_BB(mp, 1), 0);
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+                               XFS_FSS_TO_BB(mp, 1), 0,
+                               &xfs_agi_buf_ops);
                if (!bp) {
                        error = ENOMEM;
                        goto error0;
                }
+
                agi = XFS_BUF_TO_AGI(bp);
-               memset(agi, 0, mp->m_sb.sb_sectsize);
                agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
                agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
                agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
                /*
                 * BNO btree root block
                 */
-               bp = xfs_buf_get(mp->m_ddev_targp,
-                                XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
-                                BTOBB(mp->m_sb.sb_blocksize), 0);
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+                               BTOBB(mp->m_sb.sb_blocksize), 0,
+                               &xfs_allocbt_buf_ops);
+
                if (!bp) {
                        error = ENOMEM;
                        goto error0;
                }
-               block = XFS_BUF_TO_BLOCK(bp);
-               memset(block, 0, mp->m_sb.sb_blocksize);
-               block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
-               block->bb_level = 0;
-               block->bb_numrecs = cpu_to_be16(1);
-               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+               xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+               arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
+
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
                if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
                /*
                 * CNT btree root block
                 */
-               bp = xfs_buf_get(mp->m_ddev_targp,
-                                XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
-                                BTOBB(mp->m_sb.sb_blocksize), 0);
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+                               BTOBB(mp->m_sb.sb_blocksize), 0,
+                               &xfs_allocbt_buf_ops);
                if (!bp) {
                        error = ENOMEM;
                        goto error0;
                }
-               block = XFS_BUF_TO_BLOCK(bp);
-               memset(block, 0, mp->m_sb.sb_blocksize);
-               block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
-               block->bb_level = 0;
-               block->bb_numrecs = cpu_to_be16(1);
-               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
-               arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+               xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+               arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
                arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
                arec->ar_blockcount = cpu_to_be32(
                        agsize - be32_to_cpu(arec->ar_startblock));
                nfree += be32_to_cpu(arec->ar_blockcount);
+
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
                if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
                /*
                 * INO btree root block
                 */
-               bp = xfs_buf_get(mp->m_ddev_targp,
-                                XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
-                                BTOBB(mp->m_sb.sb_blocksize), 0);
+               bp = xfs_growfs_get_hdr_buf(mp,
+                               XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+                               BTOBB(mp->m_sb.sb_blocksize), 0,
+                               &xfs_inobt_buf_ops);
                if (!bp) {
                        error = ENOMEM;
                        goto error0;
                }
-               block = XFS_BUF_TO_BLOCK(bp);
-               memset(block, 0, mp->m_sb.sb_blocksize);
-               block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
-               block->bb_level = 0;
-               block->bb_numrecs = 0;
-               block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
-               block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+               xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
                error = xfs_bwrite(bp);
                xfs_buf_relse(bp);
                if (error)
@@ -408,14 +452,16 @@ xfs_growfs_data_private(
                if (agno < oagcount) {
                        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
                                  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
-                                 XFS_FSS_TO_BB(mp, 1), 0, &bp);
+                                 XFS_FSS_TO_BB(mp, 1), 0, &bp,
+                                 &xfs_sb_buf_ops);
                } else {
                        bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
                                  XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
                                  XFS_FSS_TO_BB(mp, 1), 0);
-                       if (bp)
+                       if (bp) {
+                               bp->b_ops = &xfs_sb_buf_ops;
                                xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
-                       else
+                       } else
                                error = ENOMEM;
                }
 
@@ -426,6 +472,7 @@ xfs_growfs_data_private(
                        break;
                }
                xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
                /*
                 * If we get an error writing out the alternate superblocks,
                 * just issue a warning and continue.  The real work is
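
Each header written by the growfs loop above now follows one four-step shape: get a zeroed uncached buffer with the right verifier attached, fill it in, write it synchronously, release it. Condensed (a sketch using only calls visible in the hunks above; declarations elided):

        bp = xfs_growfs_get_hdr_buf(mp, daddr, numblks, 0, ops);
        if (!bp)
                return ENOMEM;  /* positive error codes, as elsewhere here */

        /* fill in the zeroed header; btree roots use xfs_btree_init_block() */

        error = xfs_bwrite(bp); /* synchronous write through the verifier */
        xfs_buf_relse(bp);
        if (error)
                return error;
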
index 76e81cff70b9a71b2f3a9a5c3d2487a437992fa1..5399ef222dd738b85d39096ac7db5a39ebb8c359 100644 (file)
@@ -21,7 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
  */
 xfs_param_t xfs_params = {
                          /*    MIN             DFLT            MAX     */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
        .rotorstep      = {     1,              1,              255     },
        .inherit_nodfrg = {     0,              1,              1       },
        .fstrm_timer    = {     1,              30*100,         3600*100},
+       .eofb_timer     = {     1,              300,            3600*24},
 };
index c5c4ef4f2bdbec16d9eb07ff0188921f677384ff..a815412eab805e69ba4bbc670095267651010180 100644 (file)
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
                 */
                d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
                fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
-                                        mp->m_bsize * blks_per_cluster, 0);
+                                        mp->m_bsize * blks_per_cluster,
+                                        XBF_UNMAPPED);
                if (!fbuf)
                        return ENOMEM;
                /*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
                 *      to log a whole cluster of inodes instead of all the
                 *      individual transactions causing a lot of log traffic.
                 */
+               fbuf->b_ops = &xfs_inode_buf_ops;
                xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
                for (i = 0; i < ninodes; i++) {
                        int     ioffset = i << mp->m_sb.sb_inodelog;
@@ -877,9 +879,9 @@ error0:
  * This function is designed to be called twice if it has to do an allocation
  * to make more free inodes.  On the first call, *IO_agbp should be set to NULL.
  * If an inode is available without having to perform an allocation, an inode
- * number is returned.  In this case, *IO_agbp would be NULL.  If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp.  The caller should then commit the current transaction, allocate a
+ * number is returned.  In this case, *IO_agbp is set to NULL.  If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
  * new transaction, and call xfs_dialloc() again, passing in the previous value
  * of *IO_agbp.  IO_agbp should be held across the transactions. Since the AGI
  * buffer is locked across the two calls, the second call is guaranteed to have
@@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked(
 #define xfs_check_agi_unlinked(agi)
 #endif
 
+static void
+xfs_agi_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_agi  *agi = XFS_BUF_TO_AGI(bp);
+       int             agi_ok;
+
+       /*
+        * Validate the magic number of the agi block.
+        */
+       agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+       /*
+        * during growfs operations, the perag is not fully initialised,
+        * so we can't use it for any useful checking. growfs ensures we can't
+        * use it by using uncached buffers that don't have the perag attached
+        * so we can detect and avoid this problem.
+        */
+       if (bp->b_pag)
+               agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+                                               bp->b_pag->pag_agno;
+
+       if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+                       XFS_RANDOM_IALLOC_READ_AGI))) {
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+       xfs_check_agi_unlinked(agi);
+}
+
+static void
+xfs_agi_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agi_verify(bp);
+}
+
+static void
+xfs_agi_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_agi_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+       .verify_read = xfs_agi_read_verify,
+       .verify_write = xfs_agi_write_verify,
+};
+
 /*
  * Read in the allocation group header (inode allocation section)
  */
@@ -1482,38 +1535,18 @@ xfs_read_agi(
        xfs_agnumber_t          agno,   /* allocation group number */
        struct xfs_buf          **bpp)  /* allocation group hdr buf */
 {
-       struct xfs_agi          *agi;   /* allocation group header */
-       int                     agi_ok; /* agi is consistent */
        int                     error;
 
        ASSERT(agno != NULLAGNUMBER);
 
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                        XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
-                       XFS_FSS_TO_BB(mp, 1), 0, bpp);
+                       XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
        if (error)
                return error;
 
        ASSERT(!xfs_buf_geterror(*bpp));
-       agi = XFS_BUF_TO_AGI(*bpp);
-
-       /*
-        * Validate the magic number of the agi block.
-        */
-       agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
-               XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
-               be32_to_cpu(agi->agi_seqno) == agno;
-       if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
-                       XFS_RANDOM_IALLOC_READ_AGI))) {
-               XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
-                                    mp, agi);
-               xfs_trans_brelse(tp, *bpp);
-               return XFS_ERROR(EFSCORRUPTED);
-       }
-
        xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
-       xfs_check_agi_unlinked(agi);
        return 0;
 }
 
index 1fd6ea4e9c91cdb09af5c80ff293ad818894a2d1..c8da3df271e6b94c97bc76d627ad1c1052ad03b7 100644 (file)
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
 /*
  * Get the data from the pointed-to record.
  */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
                xfs_inobt_rec_incore_t *rec, int *stat);
 
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
 #endif /* __XFS_IALLOC_H__ */
index 2b8b7a37aa185dace60a254403eaa22c8a3b1daa..bec344b365079fc51c9a2057ccb9ae7982ab6ec1 100644 (file)
@@ -33,6 +33,7 @@
 #include "xfs_ialloc.h"
 #include "xfs_alloc.h"
 #include "xfs_error.h"
+#include "xfs_trace.h"
 
 
 STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
                          cur->bc_rec.i.ir_startino;
 }
 
+void
+xfs_inobt_verify(
+       struct xfs_buf          *bp)
+{
+       struct xfs_mount        *mp = bp->b_target->bt_mount;
+       struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
+       unsigned int            level;
+       int                     sblock_ok; /* block passes checks */
+
+       /* magic number and level verification */
+       level = be16_to_cpu(block->bb_level);
+       sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+                   level < mp->m_in_maxlevels;
+
+       /* numrecs verification */
+       sblock_ok = sblock_ok &&
+               be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+       /* sibling pointer verification */
+       sblock_ok = sblock_ok &&
+               (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+               block->bb_u.s.bb_leftsib &&
+               (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+                be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+               block->bb_u.s.bb_rightsib;
+
+       if (!sblock_ok) {
+               trace_xfs_btree_corrupt(bp, _RET_IP_);
+               XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+               xfs_buf_ioerror(bp, EFSCORRUPTED);
+       }
+}
+
+static void
+xfs_inobt_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inobt_verify(bp);
+}
+
+static void
+xfs_inobt_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inobt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+       .verify_read = xfs_inobt_read_verify,
+       .verify_write = xfs_inobt_write_verify,
+};
+
 #ifdef DEBUG
 STATIC int
 xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
        .init_rec_from_cur      = xfs_inobt_init_rec_from_cur,
        .init_ptr_from_cur      = xfs_inobt_init_ptr_from_cur,
        .key_diff               = xfs_inobt_key_diff,
+       .buf_ops                = &xfs_inobt_buf_ops,
 #ifdef DEBUG
        .keys_inorder           = xfs_inobt_keys_inorder,
        .recs_inorder           = xfs_inobt_recs_inorder,
index f782ad0c4769483ae23ab25dade35ee737ea4c63..25c0239a8eab78f7ba60be5ee02c4d85cd1649aa 100644 (file)
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
                struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
 extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
 
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
 #endif /* __XFS_IALLOC_BTREE_H__ */
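
The new fs/xfs/xfs_icache.c below collects the inode cache code; its RCU-safe lookup hinges on zeroing i_ino under i_flags_lock before the inode is RCU-freed, and re-checking it under the same lock on every cache hit. Condensed from xfs_inode_free() and xfs_iget_cache_hit() below (a sketch, not additional patch content):

        /* free side: make the inode unmistakably dead before RCU free */
        spin_lock(&ip->i_flags_lock);
        ip->i_flags = XFS_IRECLAIM;
        ip->i_ino = 0;
        spin_unlock(&ip->i_flags_lock);
        call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);

        /* lookup side: detect a free/reuse that raced with this RCU walk */
        spin_lock(&ip->i_flags_lock);
        if (ip->i_ino != ino) {
                error = EAGAIN;         /* caller delays and retries */
                goto out_error;
        }
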
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
new file mode 100644 (file)
index 0000000..96e344e
--- /dev/null
@@ -0,0 +1,1341 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_log.h"
+#include "xfs_log_priv.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_inode_item.h"
+#include "xfs_quota.h"
+#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_icache.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+                               struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+       struct xfs_mount        *mp,
+       xfs_ino_t               ino)
+{
+       struct xfs_inode        *ip;
+
+       /*
+        * if this didn't occur in transactions, we could use
+        * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+        * code up to do this anyway.
+        */
+       ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+       if (!ip)
+               return NULL;
+       if (inode_init_always(mp->m_super, VFS_I(ip))) {
+               kmem_zone_free(xfs_inode_zone, ip);
+               return NULL;
+       }
+
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(!xfs_isiflocked(ip));
+       ASSERT(ip->i_ino == 0);
+
+       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+       /* initialise the xfs inode */
+       ip->i_ino = ino;
+       ip->i_mount = mp;
+       memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+       ip->i_afp = NULL;
+       memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+       ip->i_flags = 0;
+       ip->i_delayed_blks = 0;
+       memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+       return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+       struct rcu_head         *head)
+{
+       struct inode            *inode = container_of(head, struct inode, i_rcu);
+       struct xfs_inode        *ip = XFS_I(inode);
+
+       kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+       struct xfs_inode        *ip)
+{
+       switch (ip->i_d.di_mode & S_IFMT) {
+       case S_IFREG:
+       case S_IFDIR:
+       case S_IFLNK:
+               xfs_idestroy_fork(ip, XFS_DATA_FORK);
+               break;
+       }
+
+       if (ip->i_afp)
+               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+       if (ip->i_itemp) {
+               ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+               xfs_inode_item_destroy(ip);
+               ip->i_itemp = NULL;
+       }
+
+       /* asserts to verify all state is correct here */
+       ASSERT(atomic_read(&ip->i_pincount) == 0);
+       ASSERT(!spin_is_locked(&ip->i_flags_lock));
+       ASSERT(!xfs_isiflocked(ip));
+
+       /*
+        * Because we use RCU freeing we need to ensure the inode always
+        * appears to be reclaimed with an invalid inode number when in the
+        * free state. The ip->i_flags_lock provides the barrier against lookup
+        * races.
+        */
+       spin_lock(&ip->i_flags_lock);
+       ip->i_flags = XFS_IRECLAIM;
+       ip->i_ino = 0;
+       spin_unlock(&ip->i_flags_lock);
+
+       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found in the cache
+ */
+static int
+xfs_iget_cache_hit(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *ip,
+       xfs_ino_t               ino,
+       int                     flags,
+       int                     lock_flags) __releases(RCU)
+{
+       struct inode            *inode = VFS_I(ip);
+       struct xfs_mount        *mp = ip->i_mount;
+       int                     error;
+
+       /*
+        * check for re-use of an inode within an RCU grace period due to the
+        * radix tree nodes not being updated yet. We monitor for this by
+        * setting the inode number to zero before freeing the inode structure.
+        * If the inode has been reallocated and set up, then the inode number
+        * will not match, so check for that, too.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (ip->i_ino != ino) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               error = EAGAIN;
+               goto out_error;
+       }
+
+       /*
+        * If we are racing with another cache hit that is currently
+        * instantiating this inode or currently recycling it out of
+        * reclaimable state, wait for the initialisation to complete
+        * before continuing.
+        *
+        * XXX(hch): eventually we should do something equivalent to
+        *           wait_on_inode to wait for these flags to be cleared
+        *           instead of polling for it.
+        */
+       if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+               trace_xfs_iget_skip(ip);
+               XFS_STATS_INC(xs_ig_frecycle);
+               error = EAGAIN;
+               goto out_error;
+       }
+
+       /*
+        * If lookup is racing with unlink return an error immediately.
+        */
+       if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+               error = ENOENT;
+               goto out_error;
+       }
+
+       /*
+        * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+        * Need to carefully get it back into useable state.
+        */
+       if (ip->i_flags & XFS_IRECLAIMABLE) {
+               trace_xfs_iget_reclaim(ip);
+
+               /*
+                * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+                * from stomping over us while we recycle the inode.  We can't
+                * clear the radix tree reclaimable tag yet as it requires
+                * pag_ici_lock to be held exclusive.
+                */
+               ip->i_flags |= XFS_IRECLAIM;
+
+               spin_unlock(&ip->i_flags_lock);
+               rcu_read_unlock();
+
+               error = -inode_init_always(mp->m_super, inode);
+               if (error) {
+                       /*
+                        * Re-initializing the inode failed, and we are in deep
+                        * trouble.  Try to re-add it to the reclaim list.
+                        */
+                       rcu_read_lock();
+                       spin_lock(&ip->i_flags_lock);
+
+                       ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+                       ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+                       trace_xfs_iget_reclaim_fail(ip);
+                       goto out_error;
+               }
+
+               spin_lock(&pag->pag_ici_lock);
+               spin_lock(&ip->i_flags_lock);
+
+               /*
+                * Clear the per-lifetime state in the inode as we are now
+                * effectively a new inode and need to return to the initial
+                * state before reuse occurs.
+                */
+               ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+               ip->i_flags |= XFS_INEW;
+               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+               inode->i_state = I_NEW;
+
+               ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+               mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+               spin_unlock(&ip->i_flags_lock);
+               spin_unlock(&pag->pag_ici_lock);
+       } else {
+               /* If the VFS inode is being torn down, pause and try again. */
+               if (!igrab(inode)) {
+                       trace_xfs_iget_skip(ip);
+                       error = EAGAIN;
+                       goto out_error;
+               }
+
+               /* We've got a live one. */
+               spin_unlock(&ip->i_flags_lock);
+               rcu_read_unlock();
+               trace_xfs_iget_hit(ip);
+       }
+
+       if (lock_flags != 0)
+               xfs_ilock(ip, lock_flags);
+
+       xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+       XFS_STATS_INC(xs_ig_found);
+
+       return 0;
+
+out_error:
+       spin_unlock(&ip->i_flags_lock);
+       rcu_read_unlock();
+       return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       xfs_trans_t             *tp,
+       xfs_ino_t               ino,
+       struct xfs_inode        **ipp,
+       int                     flags,
+       int                     lock_flags)
+{
+       struct xfs_inode        *ip;
+       int                     error;
+       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
+       int                     iflags;
+
+       ip = xfs_inode_alloc(mp, ino);
+       if (!ip)
+               return ENOMEM;
+
+       error = xfs_iread(mp, tp, ip, flags);
+       if (error)
+               goto out_destroy;
+
+       trace_xfs_iget_miss(ip);
+
+       if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+               error = ENOENT;
+               goto out_destroy;
+       }
+
+       /*
+        * Preload the radix tree so we can insert safely under the
+        * write spinlock. Note that we cannot sleep inside the preload
+        * region. Since we can be called from transaction context, don't
+        * recurse into the file system.
+        */
+       if (radix_tree_preload(GFP_NOFS)) {
+               error = EAGAIN;
+               goto out_destroy;
+       }
+
+       /*
+        * Because the inode hasn't been added to the radix-tree yet it can't
+        * be found by another thread, so we can do the non-sleeping lock here.
+        */
+       if (lock_flags) {
+               if (!xfs_ilock_nowait(ip, lock_flags))
+                       BUG();
+       }
+
+       /*
+        * These values must be set before inserting the inode into the radix
+        * tree as the moment it is inserted a concurrent lookup (allowed by the
+        * RCU locking mechanism) can find it and that lookup must see that this
+        * is an inode currently under construction (i.e. that XFS_INEW is set).
+        * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+        * memory barrier that ensures this detection works correctly at lookup
+        * time.
+        */
+       iflags = XFS_INEW;
+       if (flags & XFS_IGET_DONTCACHE)
+               iflags |= XFS_IDONTCACHE;
+       ip->i_udquot = ip->i_gdquot = NULL;
+       xfs_iflags_set(ip, iflags);
+
+       /* insert the new inode */
+       spin_lock(&pag->pag_ici_lock);
+       error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+       if (unlikely(error)) {
+               WARN_ON(error != -EEXIST);
+               XFS_STATS_INC(xs_ig_dup);
+               error = EAGAIN;
+               goto out_preload_end;
+       }
+       spin_unlock(&pag->pag_ici_lock);
+       radix_tree_preload_end();
+
+       *ipp = ip;
+       return 0;
+
+out_preload_end:
+       spin_unlock(&pag->pag_ici_lock);
+       radix_tree_preload_end();
+       if (lock_flags)
+               xfs_iunlock(ip, lock_flags);
+out_destroy:
+       __destroy_inode(VFS_I(ip));
+       xfs_inode_free(ip);
+       return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system.  It points
+ *       to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one.  This is
+ *       simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired.  This is the unique identifier
+ *        within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode.  See the comment
+ *              for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+       xfs_mount_t     *mp,
+       xfs_trans_t     *tp,
+       xfs_ino_t       ino,
+       uint            flags,
+       uint            lock_flags,
+       xfs_inode_t     **ipp)
+{
+       xfs_inode_t     *ip;
+       int             error;
+       xfs_perag_t     *pag;
+       xfs_agino_t     agino;
+
+       /*
+        * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+        * doesn't get freed while it's being referenced during a
+        * radix tree traversal here.  It assumes this function
+        * acquires only the ILOCK (and therefore it has no need to
+        * involve the IOLOCK in this synchronization).
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+       /* reject inode numbers outside existing AGs */
+       if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+               return EINVAL;
+
+       /* get the perag structure and ensure that it's inode capable */
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+       agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+       error = 0;
+       rcu_read_lock();
+       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+       if (ip) {
+               error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
+       } else {
+               rcu_read_unlock();
+               XFS_STATS_INC(xs_ig_missed);
+
+               error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+                                                       flags, lock_flags);
+               if (error)
+                       goto out_error_or_again;
+       }
+       xfs_perag_put(pag);
+
+       *ipp = ip;
+
+       /*
+        * If we have a real type for an on-disk inode, we can set ops(&unlock)
+        * now.  If it's a new inode being created, xfs_ialloc will handle it.
+        */
+       if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+               xfs_setup_inode(ip);
+       return 0;
+
+out_error_or_again:
+       if (error == EAGAIN) {
+               delay(1);
+               goto again;
+       }
+       xfs_perag_put(pag);
+       return error;
+}
+
+/*
+ * The inode lookup is done in batches to keep the amount of lock traffic and
+ * radix tree lookups to a minimum. The batch size is a trade off between
+ * lookup reduction and stack usage. This is in the reclaim path, so we can't
+ * be too greedy.
+ */
+#define XFS_LOOKUP_BATCH       32
+
+STATIC int
+xfs_inode_ag_walk_grab(
+       struct xfs_inode        *ip)
+{
+       struct inode            *inode = VFS_I(ip);
+
+       ASSERT(rcu_read_lock_held());
+
+       /*
+        * check for stale RCU freed inode
+        *
+        * If the inode has been reallocated, it doesn't matter if it's not in
+        * the AG we are walking - we are walking for writeback, so if it
+        * passes all the "valid inode" checks and is dirty, then we'll write
+        * it back anyway.  If it has been reallocated and is still being
+        * initialised, the XFS_INEW check below will catch it.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!ip->i_ino)
+               goto out_unlock_noent;
+
+       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
+       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
+               goto out_unlock_noent;
+       spin_unlock(&ip->i_flags_lock);
+
+       /* nothing to sync during shutdown */
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
+               return EFSCORRUPTED;
+
+       /* If we can't grab the inode, it must be on its way to reclaim. */
+       if (!igrab(inode))
+               return ENOENT;
+
+       if (is_bad_inode(inode)) {
+               IRELE(ip);
+               return ENOENT;
+       }
+
+       /* inode is valid */
+       return 0;
+
+out_unlock_noent:
+       spin_unlock(&ip->i_flags_lock);
+       return ENOENT;
+}
+
+STATIC int
+xfs_inode_ag_walk(
+       struct xfs_mount        *mp,
+       struct xfs_perag        *pag,
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags,
+                                          void *args),
+       int                     flags,
+       void                    *args,
+       int                     tag)
+{
+       uint32_t                first_index;
+       int                     last_error = 0;
+       int                     skipped;
+       int                     done;
+       int                     nr_found;
+
+restart:
+       done = 0;
+       skipped = 0;
+       first_index = 0;
+       nr_found = 0;
+       do {
+               struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+               int             error = 0;
+               int             i;
+
+               rcu_read_lock();
+
+               if (tag == -1)
+                       nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+                                       (void **)batch, first_index,
+                                       XFS_LOOKUP_BATCH);
+               else
+                       nr_found = radix_tree_gang_lookup_tag(
+                                       &pag->pag_ici_root,
+                                       (void **) batch, first_index,
+                                       XFS_LOOKUP_BATCH, tag);
+
+               if (!nr_found) {
+                       rcu_read_unlock();
+                       break;
+               }
+
+               /*
+                * Grab the inodes before we drop the lock. If we found
+                * nothing, nr_found == 0 and the loop will be skipped.
+                */
+               for (i = 0; i < nr_found; i++) {
+                       struct xfs_inode *ip = batch[i];
+
+                       if (done || xfs_inode_ag_walk_grab(ip))
+                               batch[i] = NULL;
+
+                       /*
+                        * Update the index for the next lookup. Catch
+                        * overflows into the next AG range which can occur if
+                        * we have inodes in the last block of the AG and we
+                        * are currently pointing to the last inode.
+                        *
+                        * Because we may see inodes that are from the wrong AG
+                        * due to RCU freeing and reallocation, only update the
+                        * index if it lies in this AG. It was a race that led
+                        * us to see this inode, so another lookup from the
+                        * same index will not find it again.
+                        */
+                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
+                               continue;
+                       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                       if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                               done = 1;
+               }
+
+               /* unlock now we've grabbed the inodes. */
+               rcu_read_unlock();
+
+               for (i = 0; i < nr_found; i++) {
+                       if (!batch[i])
+                               continue;
+                       error = execute(batch[i], pag, flags, args);
+                       IRELE(batch[i]);
+                       if (error == EAGAIN) {
+                               skipped++;
+                               continue;
+                       }
+                       if (error && last_error != EFSCORRUPTED)
+                               last_error = error;
+               }
+
+               /* bail out if the filesystem is corrupted.  */
+               if (error == EFSCORRUPTED)
+                       break;
+
+               cond_resched();
+
+       } while (nr_found && !done);
+
+       if (skipped) {
+               delay(1);
+               goto restart;
+       }
+       return last_error;
+}
+
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+       struct xfs_mount *mp)
+{
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+               queue_delayed_work(mp->m_eofblocks_workqueue,
+                                  &mp->m_eofblocks_work,
+                                  msecs_to_jiffies(xfs_eofb_secs * 1000));
+       rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                               struct xfs_mount, m_eofblocks_work);
+       xfs_icache_free_eofblocks(mp, NULL);
+       xfs_queue_eofblocks(mp);
+}
+
+int
+xfs_inode_ag_iterator(
+       struct xfs_mount        *mp,
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags,
+                                          void *args),
+       int                     flags,
+       void                    *args)
+{
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     last_error = 0;
+       xfs_agnumber_t          ag;
+
+       ag = 0;
+       while ((pag = xfs_perag_get(mp, ag))) {
+               ag = pag->pag_agno + 1;
+               error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
+               xfs_perag_put(pag);
+               if (error) {
+                       last_error = error;
+                       if (error == EFSCORRUPTED)
+                               break;
+               }
+       }
+       return XFS_ERROR(last_error);
+}
+
+int
+xfs_inode_ag_iterator_tag(
+       struct xfs_mount        *mp,
+       int                     (*execute)(struct xfs_inode *ip,
+                                          struct xfs_perag *pag, int flags,
+                                          void *args),
+       int                     flags,
+       void                    *args,
+       int                     tag)
+{
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     last_error = 0;
+       xfs_agnumber_t          ag;
+
+       ag = 0;
+       while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+               ag = pag->pag_agno + 1;
+               error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+               xfs_perag_put(pag);
+               if (error) {
+                       last_error = error;
+                       if (error == EFSCORRUPTED)
+                               break;
+               }
+       }
+       return XFS_ERROR(last_error);
+}
+
+/*
+ * Queue a new inode reclaim pass if there are reclaimable inodes and there
+ * isn't a reclaim pass already in progress. By default it runs every 5s based
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
+ * tunable, but that can be done if this method proves to be ineffective or too
+ * aggressive.
+ */
+static void
+xfs_reclaim_work_queue(
+       struct xfs_mount        *mp)
+{
+
+       rcu_read_lock();
+       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
+               queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
+                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * This is a fast pass over the inode cache to try to get reclaim moving on as
+ * many inodes as possible in a short period of time. It kicks itself every few
+ * seconds, as well as being kicked by the inode cache shrinker when memory
+ * goes low. It scans as quickly as possible avoiding locked inodes or those
+ * already being flushed, and once done schedules a future pass.
+ */
+void
+xfs_reclaim_worker(
+       struct work_struct *work)
+{
+       struct xfs_mount *mp = container_of(to_delayed_work(work),
+                                       struct xfs_mount, m_reclaim_work);
+
+       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
+       xfs_reclaim_work_queue(mp);
+}
+
+static void
+__xfs_inode_set_reclaim_tag(
+       struct xfs_perag        *pag,
+       struct xfs_inode        *ip)
+{
+       radix_tree_tag_set(&pag->pag_ici_root,
+                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+                          XFS_ICI_RECLAIM_TAG);
+
+       if (!pag->pag_ici_reclaimable) {
+               /* propagate the reclaim tag up into the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                               XFS_ICI_RECLAIM_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+
+               /* schedule periodic background inode reclaim */
+               xfs_reclaim_work_queue(ip->i_mount);
+
+               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
+                                                       -1, _RET_IP_);
+       }
+       pag->pag_ici_reclaimable++;
+}
+
+/*
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ */
+void
+xfs_inode_set_reclaim_tag(
+       xfs_inode_t     *ip)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       struct xfs_perag *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       spin_lock(&ip->i_flags_lock);
+       __xfs_inode_set_reclaim_tag(pag, ip);
+       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+       spin_unlock(&ip->i_flags_lock);
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
+STATIC void
+__xfs_inode_clear_reclaim(
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       pag->pag_ici_reclaimable--;
+       if (!pag->pag_ici_reclaimable) {
+               /* clear the reclaim tag from the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                               XFS_ICI_RECLAIM_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
+                                                       -1, _RET_IP_);
+       }
+}
+
+STATIC void
+__xfs_inode_clear_reclaim_tag(
+       xfs_mount_t     *mp,
+       xfs_perag_t     *pag,
+       xfs_inode_t     *ip)
+{
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+       __xfs_inode_clear_reclaim(pag, ip);
+}
+
+/*
+ * Grab the inode for reclaim exclusively.
+ * Return 0 if we grabbed it, non-zero otherwise.
+ */
+STATIC int
+xfs_reclaim_inode_grab(
+       struct xfs_inode        *ip,
+       int                     flags)
+{
+       ASSERT(rcu_read_lock_held());
+
+       /* quick check for stale RCU freed inode */
+       if (!ip->i_ino)
+               return 1;
+
+       /*
+        * If we are asked for non-blocking operation, do unlocked checks to
+        * see if the inode already is being flushed or in reclaim to avoid
+        * lock traffic.
+        */
+       if ((flags & SYNC_TRYLOCK) &&
+           __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
+               return 1;
+
+       /*
+        * The radix tree lock here protects a thread in xfs_iget from racing
+        * with us starting reclaim on the inode.  Once we have the
+        * XFS_IRECLAIM flag set it will not touch us.
+        *
+        * Due to RCU lookup, we may find inodes that have been freed and only
+        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
+        * aren't candidates for reclaim at all, so we must check that
+        * XFS_IRECLAIMABLE is set before proceeding to reclaim.
+        */
+       spin_lock(&ip->i_flags_lock);
+       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
+           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
+               /* not a reclaim candidate. */
+               spin_unlock(&ip->i_flags_lock);
+               return 1;
+       }
+       __xfs_iflags_set(ip, XFS_IRECLAIM);
+       spin_unlock(&ip->i_flags_lock);
+       return 0;
+}
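+
+/*
+ * Note the double-checked pattern above: the SYNC_TRYLOCK case filters
+ * out obviously busy inodes without taking i_flags_lock, and the locked
+ * recheck then makes the XFS_IRECLAIMABLE/XFS_IRECLAIM decision
+ * authoritative before the XFS_IRECLAIM flag is set.
+ */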
+
+/*
+ * Inodes in different states need to be treated differently. The following
+ * table lists the inode states and the reclaim actions necessary:
+ *
+ *     inode state          iflush ret         required action
+ *      ---------------      ----------         ---------------
+ *     bad                     -               reclaim
+ *     shutdown                EIO             unpin and reclaim
+ *     clean, unpinned         0               reclaim
+ *     stale, unpinned         0               reclaim
+ *     clean, pinned(*)        0               requeue
+ *     stale, pinned           EAGAIN          requeue
+ *     dirty, async            -               requeue
+ *     dirty, sync             0               reclaim
+ *
+ * (*) dgc: I don't think the clean, pinned state is possible but it gets
+ * handled anyway given the order of checks implemented.
+ *
+ * Also, because we get the flush lock first, we know that any inode that has
+ * been flushed delwri has had the flush completed by the time we check that
+ * the inode is clean.
+ *
+ * Note that because the inode is flushed delayed write by AIL pushing, the
+ * flush lock may already be held here and waiting on it can result in very
+ * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
+ * the caller should push the AIL first before trying to reclaim inodes to
+ * minimise the amount of time spent waiting.  For background reclaim, we only
+ * bother to reclaim clean inodes anyway.
+ *
+ * Hence the order of actions after gaining the locks should be:
+ *     bad             => reclaim
+ *     shutdown        => unpin and reclaim
+ *     pinned, async   => requeue
+ *     pinned, sync    => unpin
+ *     stale           => reclaim
+ *     clean           => reclaim
+ *     dirty, async    => requeue
+ *     dirty, sync     => flush, wait and reclaim
+ */
+STATIC int
+xfs_reclaim_inode(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     sync_mode)
+{
+       struct xfs_buf          *bp = NULL;
+       int                     error;
+
+restart:
+       error = 0;
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       if (!xfs_iflock_nowait(ip)) {
+               if (!(sync_mode & SYNC_WAIT))
+                       goto out;
+               xfs_iflock(ip);
+       }
+
+       if (is_bad_inode(VFS_I(ip)))
+               goto reclaim;
+       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+               xfs_iunpin_wait(ip);
+               xfs_iflush_abort(ip, false);
+               goto reclaim;
+       }
+       if (xfs_ipincount(ip)) {
+               if (!(sync_mode & SYNC_WAIT))
+                       goto out_ifunlock;
+               xfs_iunpin_wait(ip);
+       }
+       if (xfs_iflags_test(ip, XFS_ISTALE))
+               goto reclaim;
+       if (xfs_inode_clean(ip))
+               goto reclaim;
+
+       /*
+        * Never flush out dirty data during non-blocking reclaim, as it would
+        * just contend with AIL pushing trying to do the same job.
+        */
+       if (!(sync_mode & SYNC_WAIT))
+               goto out_ifunlock;
+
+       /*
+        * Now we have an inode that needs flushing.
+        *
+        * Note that xfs_iflush will never block on the inode buffer lock, as
+        * xfs_ifree_cluster() can lock the inode buffer before it locks the
+        * ip->i_lock, and we are doing the exact opposite here.  As a result,
+        * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
+        * result in an ABBA deadlock with xfs_ifree_cluster().
+        *
+        * As xfs_ifree_cluster() must gather all inodes that are active in the
+        * cache to mark them stale, if we hit this case we don't actually want
+        * to do IO here - we want the inode marked stale so we can simply
+        * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
+        * inode, back off and try again.  Hopefully the next pass through will
+        * see the stale flag set on the inode.
+        */
+       error = xfs_iflush(ip, &bp);
+       if (error == EAGAIN) {
+               xfs_iunlock(ip, XFS_ILOCK_EXCL);
+               /* backoff longer than in xfs_ifree_cluster */
+               delay(2);
+               goto restart;
+       }
+
+       if (!error) {
+               error = xfs_bwrite(bp);
+               xfs_buf_relse(bp);
+       }
+
+       xfs_iflock(ip);
+reclaim:
+       xfs_ifunlock(ip);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       XFS_STATS_INC(xs_ig_reclaims);
+       /*
+        * Remove the inode from the per-AG radix tree.
+        *
+        * Because radix_tree_delete won't complain even if the item was never
+        * added to the tree, assert that it's been there before to catch
+        * problems with the inode lifetime early on.
+        */
+       spin_lock(&pag->pag_ici_lock);
+       if (!radix_tree_delete(&pag->pag_ici_root,
+                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
+               ASSERT(0);
+       __xfs_inode_clear_reclaim(pag, ip);
+       spin_unlock(&pag->pag_ici_lock);
+
+       /*
+        * Here we do an (almost) spurious inode lock in order to coordinate
+        * with inode cache radix tree lookups.  This is because the lookup
+        * can reference the inodes in the cache without taking references.
+        *
+        * We make that OK here by ensuring that we wait until the inode is
+        * unlocked after the lookup before we go ahead and free it.
+        */
+       xfs_ilock(ip, XFS_ILOCK_EXCL);
+       xfs_qm_dqdetach(ip);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+       xfs_inode_free(ip);
+       return error;
+
+out_ifunlock:
+       xfs_ifunlock(ip);
+out:
+       xfs_iflags_clear(ip, XFS_IRECLAIM);
+       xfs_iunlock(ip, XFS_ILOCK_EXCL);
+       /*
+        * We could return EAGAIN here to make reclaim rescan the inode tree in
+        * a short while. However, this just burns CPU time scanning the tree
+        * waiting for IO to complete and the reclaim work never goes back to
+        * the idle state. Instead, return 0 to let the next scheduled
+        * background reclaim attempt to reclaim the inode again.
+        */
+       return 0;
+}
+
+/*
+ * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
+ * corrupted, we still want to try to reclaim all the inodes. If we don't,
+ * then a shutdown during the filesystem unmount reclaim walk will leak all the
+ * unreclaimed inodes.
+ */
+STATIC int
+xfs_reclaim_inodes_ag(
+       struct xfs_mount        *mp,
+       int                     flags,
+       int                     *nr_to_scan)
+{
+       struct xfs_perag        *pag;
+       int                     error = 0;
+       int                     last_error = 0;
+       xfs_agnumber_t          ag;
+       int                     trylock = flags & SYNC_TRYLOCK;
+       int                     skipped;
+
+restart:
+       ag = 0;
+       skipped = 0;
+       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+               unsigned long   first_index = 0;
+               int             done = 0;
+               int             nr_found = 0;
+
+               ag = pag->pag_agno + 1;
+
+               if (trylock) {
+                       if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
+                               skipped++;
+                               xfs_perag_put(pag);
+                               continue;
+                       }
+                       first_index = pag->pag_ici_reclaim_cursor;
+               } else
+                       mutex_lock(&pag->pag_ici_reclaim_lock);
+
+               do {
+                       struct xfs_inode *batch[XFS_LOOKUP_BATCH];
+                       int     i;
+
+                       rcu_read_lock();
+                       nr_found = radix_tree_gang_lookup_tag(
+                                       &pag->pag_ici_root,
+                                       (void **)batch, first_index,
+                                       XFS_LOOKUP_BATCH,
+                                       XFS_ICI_RECLAIM_TAG);
+                       if (!nr_found) {
+                               done = 1;
+                               rcu_read_unlock();
+                               break;
+                       }
+
+                       /*
+                        * Grab the inodes before we drop the lock. If we found
+                        * nothing, nr_found == 0 and the loop will be skipped.
+                        */
+                       for (i = 0; i < nr_found; i++) {
+                               struct xfs_inode *ip = batch[i];
+
+                               if (done || xfs_reclaim_inode_grab(ip, flags))
+                                       batch[i] = NULL;
+
+                               /*
+                                * Update the index for the next lookup. Catch
+                                * overflows into the next AG range which can
+                                * occur if we have inodes in the last block of
+                                * the AG and we are currently pointing to the
+                                * last inode.
+                                *
+                                * Because we may see inodes that are from the
+                                * wrong AG due to RCU freeing and
+                                * reallocation, only update the index if it
+                                * lies in this AG. It was a race that led us
+                                * to see this inode, so another lookup from
+                                * the same index will not find it again.
+                                */
+                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
+                                                               pag->pag_agno)
+                                       continue;
+                               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+                               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
+                                       done = 1;
+                       }
+
+                       /* unlock now we've grabbed the inodes. */
+                       rcu_read_unlock();
+
+                       for (i = 0; i < nr_found; i++) {
+                               if (!batch[i])
+                                       continue;
+                               error = xfs_reclaim_inode(batch[i], pag, flags);
+                               if (error && last_error != EFSCORRUPTED)
+                                       last_error = error;
+                       }
+
+                       *nr_to_scan -= XFS_LOOKUP_BATCH;
+
+                       cond_resched();
+
+               } while (nr_found && !done && *nr_to_scan > 0);
+
+               if (trylock && !done)
+                       pag->pag_ici_reclaim_cursor = first_index;
+               else
+                       pag->pag_ici_reclaim_cursor = 0;
+               mutex_unlock(&pag->pag_ici_reclaim_lock);
+               xfs_perag_put(pag);
+       }
+
+       /*
+        * If we skipped any AG, and we still have scan count remaining, do
+        * another pass, this time using blocking reclaim semantics (i.e.
+        * waiting on the reclaim locks and ignoring the reclaim cursors). This
+        * ensures that when we get more reclaimers than AGs we block rather
+        * than spin trying to execute reclaim.
+        */
+       if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
+               trylock = 0;
+               goto restart;
+       }
+       return XFS_ERROR(last_error);
+}
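+
+/*
+ * The gang lookup above amortises a single rcu_read_lock() critical
+ * section over up to XFS_LOOKUP_BATCH inodes, and nr_to_scan is charged
+ * a full batch per trip whether or not every inode in it was grabbed,
+ * which keeps shrinker-driven scans strictly bounded.
+ */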
+
+int
+xfs_reclaim_inodes(
+       xfs_mount_t     *mp,
+       int             mode)
+{
+       int             nr_to_scan = INT_MAX;
+
+       return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
+}
+
+/*
+ * Scan a certain number of inodes for reclaim.
+ *
+ * When called we make sure that there is a background (fast) inode reclaim in
+ * progress, while we throttle the speed of reclaim by doing synchronous
+ * reclaim of inodes. That means if we come across dirty inodes, we wait for
+ * them to be cleaned, which we hope will not be very long due to the
+ * background walker having already kicked the IO off on those dirty inodes.
+ */
+void
+xfs_reclaim_inodes_nr(
+       struct xfs_mount        *mp,
+       int                     nr_to_scan)
+{
+       /* kick background reclaimer and push the AIL */
+       xfs_reclaim_work_queue(mp);
+       xfs_ail_push_all(mp->m_ail);
+
+       xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+}
+
+/*
+ * Return the number of reclaimable inodes in the filesystem for
+ * the shrinker to determine how much to reclaim.
+ */
+int
+xfs_reclaim_inodes_count(
+       struct xfs_mount        *mp)
+{
+       struct xfs_perag        *pag;
+       xfs_agnumber_t          ag = 0;
+       int                     reclaimable = 0;
+
+       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
+               ag = pag->pag_agno + 1;
+               reclaimable += pag->pag_ici_reclaimable;
+               xfs_perag_put(pag);
+       }
+       return reclaimable;
+}
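+
+/*
+ * xfs_reclaim_inodes_count() and xfs_reclaim_inodes_nr() form the
+ * count/scan pair the inode cache shrinker is expected to drive; a
+ * minimal (hypothetical) wiring would look like:
+ *
+ *     reclaimable = xfs_reclaim_inodes_count(mp);
+ *     if (reclaimable)
+ *             xfs_reclaim_inodes_nr(mp, nr_to_scan);
+ */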
+
+STATIC int
+xfs_inode_match_id(
+       struct xfs_inode        *ip,
+       struct xfs_eofblocks    *eofb)
+{
+       if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
+           ip->i_d.di_uid != eofb->eof_uid)
+               return 0;
+
+       if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
+           ip->i_d.di_gid != eofb->eof_gid)
+               return 0;
+
+       if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+           xfs_get_projid(ip) != eofb->eof_prid)
+               return 0;
+
+       return 1;
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+       struct xfs_inode        *ip,
+       struct xfs_perag        *pag,
+       int                     flags,
+       void                    *args)
+{
+       int ret;
+       struct xfs_eofblocks *eofb = args;
+
+       if (!xfs_can_free_eofblocks(ip, false)) {
+               /* inode could be preallocated or append-only */
+               trace_xfs_inode_free_eofblocks_invalid(ip);
+               xfs_inode_clear_eofblocks_tag(ip);
+               return 0;
+       }
+
+       /*
+        * If the mapping is dirty the operation can block and wait for some
+        * time. Unless we are waiting, skip it.
+        */
+       if (!(flags & SYNC_WAIT) &&
+           mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+               return 0;
+
+       if (eofb) {
+               if (!xfs_inode_match_id(ip, eofb))
+                       return 0;
+
+               /* skip the inode if the file size is too small */
+               if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+                   XFS_ISIZE(ip) < eofb->eof_min_file_size)
+                       return 0;
+       }
+
+       ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+
+       /* don't revisit the inode if we're not waiting */
+       if (ret == EAGAIN && !(flags & SYNC_WAIT))
+               ret = 0;
+
+       return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+       struct xfs_mount        *mp,
+       struct xfs_eofblocks    *eofb)
+{
+       int flags = SYNC_TRYLOCK;
+
+       if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+               flags = SYNC_WAIT;
+
+       return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+                                        eofb, XFS_ICI_EOFBLOCKS_TAG);
+}
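+
+/*
+ * One possible caller, sketched (the values here are illustrative only):
+ * synchronously trim post-EOF blocks on every file of at least 1MB.
+ *
+ *     struct xfs_eofblocks eofb = { 0 };
+ *
+ *     eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_MINFILESIZE;
+ *     eofb.eof_min_file_size = 1024 * 1024;
+ *     error = xfs_icache_free_eofblocks(mp, &eofb);
+ */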
+
+void
+xfs_inode_set_eofblocks_tag(
+       xfs_inode_t     *ip)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       struct xfs_perag *pag;
+       int tagged;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       trace_xfs_inode_set_eofblocks_tag(ip);
+
+       tagged = radix_tree_tagged(&pag->pag_ici_root,
+                                  XFS_ICI_EOFBLOCKS_TAG);
+       radix_tree_tag_set(&pag->pag_ici_root,
+                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+                          XFS_ICI_EOFBLOCKS_TAG);
+       if (!tagged) {
+               /* propagate the eofblocks tag up into the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+                                  XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                  XFS_ICI_EOFBLOCKS_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+
+               /* kick off background trimming */
+               xfs_queue_eofblocks(ip->i_mount);
+
+               trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+                                             -1, _RET_IP_);
+       }
+
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
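+
+/*
+ * As with the reclaim tag, the eofblocks tag is only propagated up into
+ * m_perag_tree on the first tagged inode in the AG, and the background
+ * trimming worker is kicked at that same transition.
+ */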
+
+void
+xfs_inode_clear_eofblocks_tag(
+       xfs_inode_t     *ip)
+{
+       struct xfs_mount *mp = ip->i_mount;
+       struct xfs_perag *pag;
+
+       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+       spin_lock(&pag->pag_ici_lock);
+       trace_xfs_inode_clear_eofblocks_tag(ip);
+
+       radix_tree_tag_clear(&pag->pag_ici_root,
+                            XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+                            XFS_ICI_EOFBLOCKS_TAG);
+       if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+               /* clear the eofblocks tag from the perag radix tree */
+               spin_lock(&ip->i_mount->m_perag_lock);
+               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+                                    XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+                                    XFS_ICI_EOFBLOCKS_TAG);
+               spin_unlock(&ip->i_mount->m_perag_lock);
+               trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+                                              -1, _RET_IP_);
+       }
+
+       spin_unlock(&pag->pag_ici_lock);
+       xfs_perag_put(pag);
+}
+
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
new file mode 100644 (file)
index 0000000..e0f138c
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#ifndef XFS_ICACHE_H
+#define XFS_ICACHE_H 1
+
+struct xfs_mount;
+struct xfs_perag;
+
+#define SYNC_WAIT              0x0001  /* wait for i/o to complete */
+#define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
+
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+            uint flags, uint lock_flags, xfs_inode_t **ipp);
+
+void xfs_reclaim_worker(struct work_struct *work);
+
+int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
+int xfs_reclaim_inodes_count(struct xfs_mount *mp);
+void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+void xfs_eofblocks_worker(struct work_struct *);
+
+int xfs_sync_inode_grab(struct xfs_inode *ip);
+int xfs_inode_ag_iterator(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+               int flags, void *args),
+       int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+               int flags, void *args),
+       int flags, void *args, int tag);
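+
+/*
+ * A hypothetical iterator callback, for illustration; the signature must
+ * match the execute argument above:
+ *
+ *     STATIC int
+ *     xfs_count_one(struct xfs_inode *ip, struct xfs_perag *pag,
+ *                   int flags, void *args)
+ *     {
+ *             (*(int *)args)++;
+ *             return 0;
+ *     }
+ *
+ *     error = xfs_inode_ag_iterator(mp, xfs_count_one, 0, &count);
+ */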
+
+#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644 (file)
index 784a803..0000000
+++ /dev/null
@@ -1,705 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
-       struct xfs_mount        *mp,
-       xfs_ino_t               ino)
-{
-       struct xfs_inode        *ip;
-
-       /*
-        * if this didn't occur in transactions, we could use
-        * KM_MAYFAIL and return NULL here on ENOMEM. Set the
-        * code up to do this anyway.
-        */
-       ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
-       if (!ip)
-               return NULL;
-       if (inode_init_always(mp->m_super, VFS_I(ip))) {
-               kmem_zone_free(xfs_inode_zone, ip);
-               return NULL;
-       }
-
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(!xfs_isiflocked(ip));
-       ASSERT(ip->i_ino == 0);
-
-       mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-       /* initialise the xfs inode */
-       ip->i_ino = ino;
-       ip->i_mount = mp;
-       memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
-       ip->i_afp = NULL;
-       memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
-       ip->i_flags = 0;
-       ip->i_delayed_blks = 0;
-       memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
-       return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
-       struct rcu_head         *head)
-{
-       struct inode            *inode = container_of(head, struct inode, i_rcu);
-       struct xfs_inode        *ip = XFS_I(inode);
-
-       kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
-       struct xfs_inode        *ip)
-{
-       switch (ip->i_d.di_mode & S_IFMT) {
-       case S_IFREG:
-       case S_IFDIR:
-       case S_IFLNK:
-               xfs_idestroy_fork(ip, XFS_DATA_FORK);
-               break;
-       }
-
-       if (ip->i_afp)
-               xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
-       if (ip->i_itemp) {
-               ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
-               xfs_inode_item_destroy(ip);
-               ip->i_itemp = NULL;
-       }
-
-       /* asserts to verify all state is correct here */
-       ASSERT(atomic_read(&ip->i_pincount) == 0);
-       ASSERT(!spin_is_locked(&ip->i_flags_lock));
-       ASSERT(!xfs_isiflocked(ip));
-
-       /*
-        * Because we use RCU freeing we need to ensure the inode always
-        * appears to be reclaimed with an invalid inode number when in the
-        * free state. The ip->i_flags_lock provides the barrier against lookup
-        * races.
-        */
-       spin_lock(&ip->i_flags_lock);
-       ip->i_flags = XFS_IRECLAIM;
-       ip->i_ino = 0;
-       spin_unlock(&ip->i_flags_lock);
-
-       call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found in the cache
- */
-static int
-xfs_iget_cache_hit(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip,
-       xfs_ino_t               ino,
-       int                     flags,
-       int                     lock_flags) __releases(RCU)
-{
-       struct inode            *inode = VFS_I(ip);
-       struct xfs_mount        *mp = ip->i_mount;
-       int                     error;
-
-       /*
-        * check for re-use of an inode within an RCU grace period due to the
-        * radix tree nodes not being updated yet. We monitor for this by
-        * setting the inode number to zero before freeing the inode structure.
-        * If the inode has been reallocated and set up, then the inode number
-        * will not match, so check for that, too.
-        */
-       spin_lock(&ip->i_flags_lock);
-       if (ip->i_ino != ino) {
-               trace_xfs_iget_skip(ip);
-               XFS_STATS_INC(xs_ig_frecycle);
-               error = EAGAIN;
-               goto out_error;
-       }
-
-
-       /*
-        * If we are racing with another cache hit that is currently
-        * instantiating this inode or currently recycling it out of
- * reclaimable state, wait for the initialisation to complete
-        * before continuing.
-        *
-        * XXX(hch): eventually we should do something equivalent to
-        *           wait_on_inode to wait for these flags to be cleared
-        *           instead of polling for it.
-        */
-       if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
-               trace_xfs_iget_skip(ip);
-               XFS_STATS_INC(xs_ig_frecycle);
-               error = EAGAIN;
-               goto out_error;
-       }
-
-       /*
-        * If lookup is racing with unlink return an error immediately.
-        */
-       if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
-               error = ENOENT;
-               goto out_error;
-       }
-
-       /*
-        * If IRECLAIMABLE is set, we've torn down the VFS inode already.
-        * Need to carefully get it back into useable state.
-        */
-       if (ip->i_flags & XFS_IRECLAIMABLE) {
-               trace_xfs_iget_reclaim(ip);
-
-               /*
-                * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
-                * from stomping over us while we recycle the inode.  We can't
-                * clear the radix tree reclaimable tag yet as it requires
-                * pag_ici_lock to be held exclusive.
-                */
-               ip->i_flags |= XFS_IRECLAIM;
-
-               spin_unlock(&ip->i_flags_lock);
-               rcu_read_unlock();
-
-               error = -inode_init_always(mp->m_super, inode);
-               if (error) {
-                       /*
-                        * Re-initializing the inode failed, and we are in deep
-                        * trouble.  Try to re-add it to the reclaim list.
-                        */
-                       rcu_read_lock();
-                       spin_lock(&ip->i_flags_lock);
-
-                       ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
-                       ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
-                       trace_xfs_iget_reclaim_fail(ip);
-                       goto out_error;
-               }
-
-               spin_lock(&pag->pag_ici_lock);
-               spin_lock(&ip->i_flags_lock);
-
-               /*
-                * Clear the per-lifetime state in the inode as we are now
-                * effectively a new inode and need to return to the initial
-                * state before reuse occurs.
-                */
-               ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
-               ip->i_flags |= XFS_INEW;
-               __xfs_inode_clear_reclaim_tag(mp, pag, ip);
-               inode->i_state = I_NEW;
-
-               ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
-               mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
-               spin_unlock(&ip->i_flags_lock);
-               spin_unlock(&pag->pag_ici_lock);
-       } else {
-               /* If the VFS inode is being torn down, pause and try again. */
-               if (!igrab(inode)) {
-                       trace_xfs_iget_skip(ip);
-                       error = EAGAIN;
-                       goto out_error;
-               }
-
-               /* We've got a live one. */
-               spin_unlock(&ip->i_flags_lock);
-               rcu_read_unlock();
-               trace_xfs_iget_hit(ip);
-       }
-
-       if (lock_flags != 0)
-               xfs_ilock(ip, lock_flags);
-
-       xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
-       XFS_STATS_INC(xs_ig_found);
-
-       return 0;
-
-out_error:
-       spin_unlock(&ip->i_flags_lock);
-       rcu_read_unlock();
-       return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
-       struct xfs_mount        *mp,
-       struct xfs_perag        *pag,
-       xfs_trans_t             *tp,
-       xfs_ino_t               ino,
-       struct xfs_inode        **ipp,
-       int                     flags,
-       int                     lock_flags)
-{
-       struct xfs_inode        *ip;
-       int                     error;
-       xfs_agino_t             agino = XFS_INO_TO_AGINO(mp, ino);
-       int                     iflags;
-
-       ip = xfs_inode_alloc(mp, ino);
-       if (!ip)
-               return ENOMEM;
-
-       error = xfs_iread(mp, tp, ip, flags);
-       if (error)
-               goto out_destroy;
-
-       trace_xfs_iget_miss(ip);
-
-       if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
-               error = ENOENT;
-               goto out_destroy;
-       }
-
-       /*
-        * Preload the radix tree so we can insert safely under the
-        * write spinlock. Note that we cannot sleep inside the preload
-        * region. Since we can be called from transaction context, don't
-        * recurse into the file system.
-        */
-       if (radix_tree_preload(GFP_NOFS)) {
-               error = EAGAIN;
-               goto out_destroy;
-       }
-
-       /*
-        * Because the inode hasn't been added to the radix-tree yet it can't
-        * be found by another thread, so we can do the non-sleeping lock here.
-        */
-       if (lock_flags) {
-               if (!xfs_ilock_nowait(ip, lock_flags))
-                       BUG();
-       }
-
-       /*
-        * These values must be set before inserting the inode into the radix
-        * tree as the moment it is inserted a concurrent lookup (allowed by the
-        * RCU locking mechanism) can find it and that lookup must see that this
-        * is an inode currently under construction (i.e. that XFS_INEW is set).
-        * The ip->i_flags_lock that protects the XFS_INEW flag forms the
-        * memory barrier that ensures this detection works correctly at lookup
-        * time.
-        */
-       iflags = XFS_INEW;
-       if (flags & XFS_IGET_DONTCACHE)
-               iflags |= XFS_IDONTCACHE;
-       ip->i_udquot = ip->i_gdquot = NULL;
-       xfs_iflags_set(ip, iflags);
-
-       /* insert the new inode */
-       spin_lock(&pag->pag_ici_lock);
-       error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
-       if (unlikely(error)) {
-               WARN_ON(error != -EEXIST);
-               XFS_STATS_INC(xs_ig_dup);
-               error = EAGAIN;
-               goto out_preload_end;
-       }
-       spin_unlock(&pag->pag_ici_lock);
-       radix_tree_preload_end();
-
-       *ipp = ip;
-       return 0;
-
-out_preload_end:
-       spin_unlock(&pag->pag_ici_lock);
-       radix_tree_preload_end();
-       if (lock_flags)
-               xfs_iunlock(ip, lock_flags);
-out_destroy:
-       __destroy_inode(VFS_I(ip));
-       xfs_inode_free(ip);
-       return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system.  It points
- *       to the inode hash table.
- * tp -- a pointer to the current transaction if there is one.  This is
- *       simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired.  This is the unique identifier
- *        within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode.  See the comment
- *              for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
-       xfs_mount_t     *mp,
-       xfs_trans_t     *tp,
-       xfs_ino_t       ino,
-       uint            flags,
-       uint            lock_flags,
-       xfs_inode_t     **ipp)
-{
-       xfs_inode_t     *ip;
-       int             error;
-       xfs_perag_t     *pag;
-       xfs_agino_t     agino;
-
-       /*
-        * xfs_reclaim_inode() uses the ILOCK to ensure an inode
-        * doesn't get freed while it's being referenced during a
-        * radix tree traversal here.  It assumes this function
- * acquires only the ILOCK (and therefore it has no need to
-        * involve the IOLOCK in this synchronization).
-        */
-       ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
-       /* reject inode numbers outside existing AGs */
-       if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
-               return EINVAL;
-
-       /* get the perag structure and ensure that it's inode capable */
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
-       agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
-       error = 0;
-       rcu_read_lock();
-       ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
-       if (ip) {
-               error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
-               if (error)
-                       goto out_error_or_again;
-       } else {
-               rcu_read_unlock();
-               XFS_STATS_INC(xs_ig_missed);
-
-               error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
-                                                       flags, lock_flags);
-               if (error)
-                       goto out_error_or_again;
-       }
-       xfs_perag_put(pag);
-
-       *ipp = ip;
-
-       /*
-        * If we have a real type for an on-disk inode, we can set ops(&unlock)
-        * now.  If it's a new inode being created, xfs_ialloc will handle it.
-        */
-       if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
-               xfs_setup_inode(ip);
-       return 0;
-
-out_error_or_again:
-       if (error == EAGAIN) {
-               delay(1);
-               goto again;
-       }
-       xfs_perag_put(pag);
-       return error;
-}
-
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code.  It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards the bringing in of the
- * extents from disk for a file in b-tree format.  If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in.  Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared().  This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
-       xfs_inode_t     *ip)
-{
-       uint    lock_mode;
-
-       if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
-           ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
-               lock_mode = XFS_ILOCK_EXCL;
-       } else {
-               lock_mode = XFS_ILOCK_SHARED;
-       }
-
-       xfs_ilock(ip, lock_mode);
-
-       return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
-       xfs_inode_t     *ip,
-       unsigned int    lock_mode)
-{
-       xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock.  This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- *       to be locked.  It can be:
- *             XFS_IOLOCK_SHARED,
- *             XFS_IOLOCK_EXCL,
- *             XFS_ILOCK_SHARED,
- *             XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- *             XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
-       xfs_inode_t             *ip,
-       uint                    lock_flags)
-{
-       /*
-        * You can't set both SHARED and EXCL for the same lock,
-        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-        */
-       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-       if (lock_flags & XFS_IOLOCK_EXCL)
-               mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-       else if (lock_flags & XFS_IOLOCK_SHARED)
-               mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
-       if (lock_flags & XFS_ILOCK_EXCL)
-               mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-       else if (lock_flags & XFS_ILOCK_SHARED)
-               mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
-       trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep.  It returns 1 if it gets
- * the requested locks and 0 otherwise.  If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       locked.  See the comment for xfs_ilock() for a list
- *      of valid values.
- */
-int
-xfs_ilock_nowait(
-       xfs_inode_t             *ip,
-       uint                    lock_flags)
-{
-       /*
-        * You can't set both SHARED and EXCL for the same lock,
-        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-        */
-       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
-       if (lock_flags & XFS_IOLOCK_EXCL) {
-               if (!mrtryupdate(&ip->i_iolock))
-                       goto out;
-       } else if (lock_flags & XFS_IOLOCK_SHARED) {
-               if (!mrtryaccess(&ip->i_iolock))
-                       goto out;
-       }
-       if (lock_flags & XFS_ILOCK_EXCL) {
-               if (!mrtryupdate(&ip->i_lock))
-                       goto out_undo_iolock;
-       } else if (lock_flags & XFS_ILOCK_SHARED) {
-               if (!mrtryaccess(&ip->i_lock))
-                       goto out_undo_iolock;
-       }
-       trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
-       return 1;
-
- out_undo_iolock:
-       if (lock_flags & XFS_IOLOCK_EXCL)
-               mrunlock_excl(&ip->i_iolock);
-       else if (lock_flags & XFS_IOLOCK_SHARED)
-               mrunlock_shared(&ip->i_iolock);
- out:
-       return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- *       unlocked.  See the comment for xfs_ilock() for a list
- *      of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
-       xfs_inode_t             *ip,
-       uint                    lock_flags)
-{
-       /*
-        * You can't set both SHARED and EXCL for the same lock,
-        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
-        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
-        */
-       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
-              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
-       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
-              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-       ASSERT(lock_flags != 0);
-
-       if (lock_flags & XFS_IOLOCK_EXCL)
-               mrunlock_excl(&ip->i_iolock);
-       else if (lock_flags & XFS_IOLOCK_SHARED)
-               mrunlock_shared(&ip->i_iolock);
-
-       if (lock_flags & XFS_ILOCK_EXCL)
-               mrunlock_excl(&ip->i_lock);
-       else if (lock_flags & XFS_ILOCK_SHARED)
-               mrunlock_shared(&ip->i_lock);
-
-       trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks.  the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
-       xfs_inode_t             *ip,
-       uint                    lock_flags)
-{
-       ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
-       ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
-       if (lock_flags & XFS_ILOCK_EXCL)
-               mrdemote(&ip->i_lock);
-       if (lock_flags & XFS_IOLOCK_EXCL)
-               mrdemote(&ip->i_iolock);
-
-       trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
-       xfs_inode_t             *ip,
-       uint                    lock_flags)
-{
-       if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
-               if (!(lock_flags & XFS_ILOCK_SHARED))
-                       return !!ip->i_lock.mr_writer;
-               return rwsem_is_locked(&ip->i_lock.mr_lock);
-       }
-
-       if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
-               if (!(lock_flags & XFS_IOLOCK_SHARED))
-                       return !!ip->i_iolock.mr_writer;
-               return rwsem_is_locked(&ip->i_iolock.mr_lock);
-       }
-
-       ASSERT(0);
-       return 0;
-}
-#endif
-
-void
-__xfs_iflock(
-       struct xfs_inode        *ip)
-{
-       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
-       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
-       do {
-               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
-               if (xfs_isiflocked(ip))
-                       io_schedule();
-       } while (!xfs_iflock_nowait(ip));
-
-       finish_wait(wq, &wait.wait);
-}
index 1938b41ee9f51bfef8f0ffad3e8b83e059ebcf41..66282dcb821bfe2e969f80b35e2eaa42a98fc6ed 100644 (file)
@@ -45,6 +45,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 kmem_zone_t *xfs_ifork_zone;
 kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
        return 0;
 }
 
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code.  It is used in places that wish to lock the inode solely
+ * for reading the extents.  The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format.  If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in.  Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though.  What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared().  This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+       xfs_inode_t     *ip)
+{
+       uint    lock_mode;
+
+       if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+           ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+               lock_mode = XFS_ILOCK_EXCL;
+       } else {
+               lock_mode = XFS_ILOCK_SHARED;
+       }
+
+       xfs_ilock(ip, lock_mode);
+
+       return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+       xfs_inode_t     *ip,
+       unsigned int    lock_mode)
+{
+       xfs_iunlock(ip, lock_mode);
+}
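+
+/*
+ * Typical usage of the pair above, sketched:
+ *
+ *     lock_mode = xfs_ilock_map_shared(ip);
+ *     ... read the extent list ...
+ *     xfs_iunlock_map_shared(ip, lock_mode);
+ */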
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock.  This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ *       to be locked.  It can be:
+ *             XFS_IOLOCK_SHARED,
+ *             XFS_IOLOCK_EXCL,
+ *             XFS_ILOCK_SHARED,
+ *             XFS_ILOCK_EXCL,
+ *             XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ *             XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ *             XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ *             XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+       else if (lock_flags & XFS_ILOCK_SHARED)
+               mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep.  It returns 1 if it gets
+ * the requested locks and 0 otherwise.  If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       locked.  See the comment for xfs_ilock() for a list
+ *      of valid values.
+ */
+int
+xfs_ilock_nowait(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL) {
+               if (!mrtryupdate(&ip->i_iolock))
+                       goto out;
+       } else if (lock_flags & XFS_IOLOCK_SHARED) {
+               if (!mrtryaccess(&ip->i_iolock))
+                       goto out;
+       }
+       if (lock_flags & XFS_ILOCK_EXCL) {
+               if (!mrtryupdate(&ip->i_lock))
+                       goto out_undo_iolock;
+       } else if (lock_flags & XFS_ILOCK_SHARED) {
+               if (!mrtryaccess(&ip->i_lock))
+                       goto out_undo_iolock;
+       }
+       return 1;
+
+ out_undo_iolock:
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrunlock_excl(&ip->i_iolock);
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mrunlock_shared(&ip->i_iolock);
+ out:
+       return 0;
+}
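+
+/*
+ * A sketched non-blocking caller: take both locks only if they are
+ * immediately available, otherwise back off and retry later.
+ *
+ *     if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL))
+ *             return EAGAIN;
+ */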
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks to be
+ *       unlocked.  See the comment for xfs_ilock() for a list
+ *      of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       /*
+        * You can't set both SHARED and EXCL for the same lock,
+        * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+        * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+        */
+       ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+              (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+       ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+              (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+       ASSERT(lock_flags != 0);
+
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrunlock_excl(&ip->i_iolock);
+       else if (lock_flags & XFS_IOLOCK_SHARED)
+               mrunlock_shared(&ip->i_iolock);
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrunlock_excl(&ip->i_lock);
+       else if (lock_flags & XFS_ILOCK_SHARED)
+               mrunlock_shared(&ip->i_lock);
+
+       trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks.  the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+       ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+       if (lock_flags & XFS_ILOCK_EXCL)
+               mrdemote(&ip->i_lock);
+       if (lock_flags & XFS_IOLOCK_EXCL)
+               mrdemote(&ip->i_iolock);
+
+       trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+       xfs_inode_t             *ip,
+       uint                    lock_flags)
+{
+       if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+               if (!(lock_flags & XFS_ILOCK_SHARED))
+                       return !!ip->i_lock.mr_writer;
+               return rwsem_is_locked(&ip->i_lock.mr_lock);
+       }
+
+       if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+               if (!(lock_flags & XFS_IOLOCK_SHARED))
+                       return !!ip->i_iolock.mr_writer;
+               return rwsem_is_locked(&ip->i_iolock.mr_lock);
+       }
+
+       ASSERT(0);
+       return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+       struct xfs_inode        *ip)
+{
+       wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+       DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+       do {
+               prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+               if (xfs_isiflocked(ip))
+                       io_schedule();
+       } while (!xfs_iflock_nowait(ip));
+
+       finish_wait(wq, &wait.wait);
+}
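+
+/*
+ * __xfs_iflock() above sleeps on the __XFS_IFLOCK_BIT waitqueue rather
+ * than spinning: prepare_to_wait_exclusive() plus io_schedule() parks
+ * the task until it is woken when the flush lock is released, and the
+ * loop retries the trylock on every wakeup.
+ */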
+
 #ifdef DEBUG
 /*
  * Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
 }
 #endif
 
+static void
+xfs_inode_buf_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       int             i;
+       int             ni;
+
+       /*
+        * Validate the magic number and version of every inode in the buffer
+        */
+       ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+       for (i = 0; i < ni; i++) {
+               int             di_ok;
+               xfs_dinode_t    *dip;
+
+               dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+                                       (i << mp->m_sb.sb_inodelog));
+               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+                           XFS_DINODE_GOOD_VERSION(dip->di_version);
+               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+                                               XFS_ERRTAG_ITOBP_INOTOBP,
+                                               XFS_RANDOM_ITOBP_INOTOBP))) {
+                       xfs_buf_ioerror(bp, EFSCORRUPTED);
+                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+                                            mp, dip);
+#ifdef DEBUG
+                       xfs_emerg(mp,
+                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+                               (unsigned long long)bp->b_bn, i,
+                               be16_to_cpu(dip->di_magic));
+                       ASSERT(0);
+#endif
+               }
+       }
+       xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp);
+}
+
+static void
+xfs_inode_buf_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_inode_buf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+       .verify_read = xfs_inode_buf_read_verify,
+       .verify_write = xfs_inode_buf_write_verify,
+};
+
+
 /*
  * This routine is called to map an inode to the buffer containing the on-disk
  * version of the inode.  It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
        struct xfs_mount        *mp,
        struct xfs_trans        *tp,
        struct xfs_imap         *imap,
-       struct xfs_dinode       **dipp,
+       struct xfs_dinode       **dipp,
        struct xfs_buf          **bpp,
        uint                    buf_flags,
        uint                    iget_flags)
 {
        struct xfs_buf          *bp;
        int                     error;
-       int                     i;
-       int                     ni;
 
        buf_flags |= XBF_UNMAPPED;
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
-                                  (int)imap->im_len, buf_flags, &bp);
+                                  (int)imap->im_len, buf_flags, &bp,
+                                  &xfs_inode_buf_ops);
        if (error) {
-               if (error != EAGAIN) {
-                       xfs_warn(mp,
-                               "%s: xfs_trans_read_buf() returned error %d.",
-                               __func__, error);
-               } else {
+               if (error == EAGAIN) {
                        ASSERT(buf_flags & XBF_TRYLOCK);
+                       return error;
                }
-               return error;
-       }
-
-       /*
-        * Validate the magic number and version of every inode in the buffer
-        * (if DEBUG kernel) or the first inode in the buffer, otherwise.
-        */
-#ifdef DEBUG
-       ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
-#else  /* usual case */
-       ni = 1;
-#endif
 
-       for (i = 0; i < ni; i++) {
-               int             di_ok;
-               xfs_dinode_t    *dip;
+               if (error == EFSCORRUPTED &&
+                   (iget_flags & XFS_IGET_UNTRUSTED))
+                       return XFS_ERROR(EINVAL);
 
-               dip = (xfs_dinode_t *)xfs_buf_offset(bp,
-                                       (i << mp->m_sb.sb_inodelog));
-               di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
-                           XFS_DINODE_GOOD_VERSION(dip->di_version);
-               if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
-                                               XFS_ERRTAG_ITOBP_INOTOBP,
-                                               XFS_RANDOM_ITOBP_INOTOBP))) {
-                       if (iget_flags & XFS_IGET_UNTRUSTED) {
-                               xfs_trans_brelse(tp, bp);
-                               return XFS_ERROR(EINVAL);
-                       }
-                       XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
-                                            mp, dip);
-#ifdef DEBUG
-                       xfs_emerg(mp,
-                               "bad inode magic/vsn daddr %lld #%d (magic=%x)",
-                               (unsigned long long)imap->im_blkno, i,
-                               be16_to_cpu(dip->di_magic));
-                       ASSERT(0);
-#endif
-                       xfs_trans_brelse(tp, bp);
-                       return XFS_ERROR(EFSCORRUPTED);
-               }
+               xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+                       __func__, error);
+               return error;
        }
 
-       xfs_inobp_check(mp, bp);
-
        *bpp = bp;
        *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
        return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
  * set according to the contents of the given cred structure.
  *
  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode.  Finally,
- * fill in the inode and log its initial contents.  In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode.  Finally, fill in the inode and
+ * log its initial contents.  In this case, ialloc_context would be
+ * set to NULL.
  *
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
  * The caller should then commit the current transaction, start a new
  * transaction, and call xfs_ialloc() again to actually get the inode.
  *
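
To make the two-phase contract above concrete, a hedged caller-side sketch
(the real caller at this point in the tree is xfs_dir_ialloc(); transaction
re-reservation and error handling are elided, and the commit flags are
placeholders):

    struct xfs_buf  *ialloc_context = NULL;
    xfs_inode_t     *ip = NULL;
    int             error;

    error = xfs_ialloc(tp, dp, mode, 1, 0, prid, okalloc,
                       &ialloc_context, &ip);
    if (!error && ialloc_context) {
            /*
             * No inode was immediately available: hold the AGI buffer
             * across the commit, start a fresh transaction (elided),
             * then call xfs_ialloc() again to actually get the inode.
             */
            xfs_trans_bhold(tp, ialloc_context);
            error = xfs_trans_commit(tp, 0);
            /* ... allocate and reserve a new tp here ... */
            if (!error)
                    error = xfs_ialloc(tp, dp, mode, 1, 0, prid,
                                       okalloc, &ialloc_context, &ip);
    }
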
@@ -1514,6 +1786,18 @@ xfs_ifree_cluster(
 
                if (!bp)
                        return ENOMEM;
+
+               /*
+                * This buffer may not have been correctly initialised as we
+                * didn't read it from disk. That's not important because we are
+                * only using it to mark the buffer as stale in the log, and to
+                * attach stale cached inodes to it. That means it will never be
+                * dispatched for IO. If it is, we want to know about it, and we
+                * want it to fail. We can achieve this by adding a write
+                * verifier to the buffer.
+                */
+               bp->b_ops = &xfs_inode_buf_ops;
+
                /*
                 * Walk the inodes already attached to the buffer and mark them
                 * stale. These will all have the flush locks held, so an
@@ -3661,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
                ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
        }
 }
+
+/*
+ * Test whether it is appropriate to check an inode for, and free, post-EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+       /* prealloc/delalloc exists only on regular files */
+       if (!S_ISREG(ip->i_d.di_mode))
+               return false;
+
+       /*
+        * Zero-sized files with no cached pages and no delalloc blocks will
+        * not have speculative prealloc/delalloc blocks to remove.
+        */
+       if (VFS_I(ip)->i_size == 0 &&
+           VN_CACHED(VFS_I(ip)) == 0 &&
+           ip->i_delayed_blks == 0)
+               return false;
+
+       /* If we haven't read in the extent list, then don't do it now. */
+       if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+               return false;
+
+       /*
+        * Do not free real preallocated or append-only files unless the file
+        * has delalloc blocks and we are forced to remove them.
+        */
+       if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+               if (!force || ip->i_delayed_blks == 0)
+                       return false;
+
+       return true;
+}
+
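
A hedged usage sketch for the predicate above: callers make these cheap checks
first and only then attempt the heavier truncation. xfs_free_eofblocks() is
assumed from elsewhere in this series and its signature may differ; inode
locking is elided.

    if (xfs_can_free_eofblocks(ip, false)) {
            int error = xfs_free_eofblocks(mp, ip, true);

            if (error)
                    return error;
    }
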
index 94b32f906e7903a32cc1e1474f3d4a6fed267f93..22baf6ea4fac8435af2bf38e23f9bf973bbd8643 100644 (file)
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
        (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
         ((pip)->i_d.di_mode & S_ISGID))
 
+
 /*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
  */
-int            xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
-                        uint, uint, xfs_inode_t **);
 void           xfs_ilock(xfs_inode_t *, uint);
 int            xfs_ilock_nowait(xfs_inode_t *, uint);
 void           xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void               xfs_ilock_demote(xfs_inode_t *, uint);
 int            xfs_isilocked(xfs_inode_t *, uint);
 uint           xfs_ilock_map_shared(xfs_inode_t *);
 void           xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void           xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
 int            xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
                           xfs_nlink_t, xfs_dev_t, prid_t, int,
                           struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void                xfs_iext_irec_compact(xfs_ifork_t *);
 void           xfs_iext_irec_compact_pages(xfs_ifork_t *);
 void           xfs_iext_irec_compact_full(xfs_ifork_t *);
 void           xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
+bool           xfs_can_free_eofblocks(struct xfs_inode *, bool);
 
 #define xfs_ipincount(ip)      ((unsigned int) atomic_read(&ip->i_pincount))
 
@@ -603,5 +598,6 @@ void                xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 extern struct kmem_zone        *xfs_ifork_zone;
 extern struct kmem_zone        *xfs_inode_zone;
 extern struct kmem_zone        *xfs_ili_zone;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
 
 #endif /* __XFS_INODE_H__ */
index c1df3c623de203f7d941ef0b706c76de270ada68..c1c3ef88a260278fb83ca4014a890284b8df5f42 100644 (file)
@@ -42,6 +42,7 @@
 #include "xfs_inode_item.h"
 #include "xfs_export.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
                error = xfs_errortag_clearall(mp, 1);
                return -error;
 
+       case XFS_IOC_FREE_EOFBLOCKS: {
+               struct xfs_eofblocks eofb;
+
+               if (copy_from_user(&eofb, arg, sizeof(eofb)))
+                       return -XFS_ERROR(EFAULT);
+
+               if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
+                       return -XFS_ERROR(EINVAL);
+
+               if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
+                       return -XFS_ERROR(EINVAL);
+
+               if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
+                   memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
+                       return -XFS_ERROR(EINVAL);
+
+               error = xfs_icache_free_eofblocks(mp, &eofb);
+               return -error;
+       }
+
        default:
                return -ENOTTY;
        }
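
A hypothetical userspace sketch of driving the new ioctl; the xfs_eofblocks
layout (eof_version, eof_flags, pad32, pad64) is inferred from the validation
above, and the header providing the definitions is an assumption.

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xfs/xfs.h>    /* assumed to provide the ioctl and struct */

    int trim_eofblocks(const char *path)
    {
            struct xfs_eofblocks eofb;
            int fd, ret;

            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;

            memset(&eofb, 0, sizeof(eofb)); /* pad32/pad64 must be zero */
            eofb.eof_version = XFS_EOFBLOCKS_VERSION;
            eofb.eof_flags = 0;             /* no filters: act on all inodes */

            ret = ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
            close(fd);
            return ret;
    }
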
index 7f537663365b08436f1ad29015b77df33f691109..add06b4e9a635511afc3e2716836e210ff794c46 100644 (file)
@@ -41,6 +41,7 @@
 #include "xfs_utils.h"
 #include "xfs_iomap.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)      (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
        xfs_extlen_t    extsz;
        int             nimaps;
        xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
-       int             prealloc, flushed = 0;
+       int             prealloc;
        int             error;
 
        ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
        }
 
        /*
-        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT.  For
-        * ENOSPC, * flush all other inodes with delalloc blocks to free up
-        * some of the excess reserved metadata space. For both cases, retry
+        * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
         * without EOF preallocation.
         */
        if (nimaps == 0) {
                trace_xfs_delalloc_enospc(ip, offset, count);
-               if (flushed)
-                       return XFS_ERROR(error ? error : ENOSPC);
-
-               if (error == ENOSPC) {
-                       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-                       xfs_flush_inodes(ip);
-                       xfs_ilock(ip, XFS_ILOCK_EXCL);
+               if (prealloc) {
+                       prealloc = 0;
+                       error = 0;
+                       goto retry;
                }
-
-               flushed = 1;
-               error = 0;
-               prealloc = 0;
-               goto retry;
+               return XFS_ERROR(error ? error : ENOSPC);
        }
 
        if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
                return xfs_alert_fsblock_zero(ip, &imap[0]);
 
+       /*
+        * Tag the inode as speculatively preallocated so we can reclaim this
+        * space on demand, if necessary.
+        */
+       if (prealloc)
+               xfs_inode_set_eofblocks_tag(ip);
+
        *ret_imap = imap[0];
        return 0;
 }
index 4e00cf091d2ccac6b0a9e65113a67b2b5169f21f..d82efaa2ac7350553c8804c014c5f299809af178 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_vnodeops.h"
 #include "xfs_inode_item.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
         * care about here.
         */
        if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
-               error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
-                                       FI_NONE);
+               error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+                                                     ip->i_d.di_size, newsize);
                if (error)
                        goto out_unlock;
        }
@@ -854,6 +855,9 @@ xfs_setattr_size(
                 * and do not wait the usual (long) time for writeout.
                 */
                xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+               /* A truncate down always removes post-EOF blocks. */
+               xfs_inode_clear_eofblocks_tag(ip);
        }
 
        if (mask & ATTR_CTIME) {
index 01d10a66e30243518d293f2c72a29b6135f7977b..2ea7d402188db8596e4c04a231d29cf0cb506756 100644 (file)
@@ -34,6 +34,7 @@
 #include "xfs_error.h"
 #include "xfs_btree.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
                                        if (xfs_inobt_maskn(chunkidx, nicluster)
                                                        & ~r.ir_free)
                                                xfs_btree_reada_bufs(mp, agno,
-                                                       agbno, nbcluster);
+                                                       agbno, nbcluster,
+                                                       &xfs_inode_buf_ops);
                                }
                                irbp->ir_startino = r.ir_startino;
                                irbp->ir_freecount = r.ir_freecount;
index 828662f70d64ec2f0bf43a66a6832c77debbdf18..fe7e4df85a7b9b2e244fd44599adcf76faefecd0 100644 (file)
@@ -44,6 +44,7 @@
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
 #include <linux/slab.h>
+#include <linux/crc32c.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
 #define xfs_rotorstep          xfs_params.rotorstep.val
 #define xfs_inherit_nodefrag   xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs    xfs_params.fstrm_timer.val
+#define xfs_eofb_secs          xfs_params.eofb_timer.val
 
 #define current_cpu()          (raw_smp_processor_id())
 #define current_pid()          (current->pid)
index 4dad756962d02e1b29c703b5b54ccee07d91fabf..46bd9d52ab518a57f4ef697d45b310060f0f6d10 100644 (file)
@@ -34,6 +34,8 @@
 #include "xfs_dinode.h"
 #include "xfs_inode.h"
 #include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_cksum.h"
 
 kmem_zone_t    *xfs_log_ticket_zone;
 
@@ -458,7 +460,8 @@ xfs_log_reserve(
        tic->t_trans_type = t_type;
        *ticp = tic;
 
-       xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+       xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+                                           : tic->t_unit_res);
 
        trace_xfs_log_reserve(log, tic);
 
@@ -679,25 +682,29 @@ out:
 }
 
 /*
- * Finish the recovery of the file system.  This is separate from
- * the xfs_log_mount() call, because it depends on the code in
- * xfs_mountfs() to read in the root and real-time bitmap inodes
- * between calling xfs_log_mount() and here.
+ * Finish the recovery of the file system.  This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
  *
- * mp          - ubiquitous xfs mount point structure
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
  */
 int
 xfs_log_mount_finish(xfs_mount_t *mp)
 {
-       int     error;
+       int     error = 0;
 
-       if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+       if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
                error = xlog_recover_finish(mp->m_log);
-       else {
-               error = 0;
+               if (!error)
+                       xfs_log_work_queue(mp);
+       } else {
                ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
        }
 
+
        return error;
 }
 
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
 }      /* xfs_log_unmount_write */
 
 /*
- * Deallocate log structures for unmount/relocation.
+ * Empty the log for unmount/freeze.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record.
+ */
+void
+xfs_log_quiesce(
+       struct xfs_mount        *mp)
+{
+       cancel_delayed_work_sync(&mp->m_log->l_work);
+       xfs_log_force(mp, XFS_LOG_SYNC);
+
+       /*
+        * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+        * will push it, xfs_wait_buftarg() will not wait for it. Further,
+        * xfs_buf_iowait() cannot be used because it was pushed with the
+        * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+        * the IO to complete.
+        */
+       xfs_ail_push_all_sync(mp->m_ail);
+       xfs_wait_buftarg(mp->m_ddev_targp);
+       xfs_buf_lock(mp->m_sb_bp);
+       xfs_buf_unlock(mp->m_sb_bp);
+
+       xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
  *
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
  */
 void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_unmount(
+       struct xfs_mount        *mp)
 {
-       cancel_delayed_work_sync(&mp->m_sync_work);
+       xfs_log_quiesce(mp);
+
        xfs_trans_ail_destroy(mp);
        xlog_dealloc_log(mp->m_log);
 }
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
         * with it being freed after writing the unmount record to the
         * log.
         */
-
-}      /* xlog_iodone */
+}
 
 /*
  * Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
 }      /* xlog_get_iclog_buffer_size */
 
 
+void
+xfs_log_work_queue(
+       struct xfs_mount        *mp)
+{
+       queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+                               msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+       struct work_struct      *work)
+{
+       struct xlog             *log = container_of(to_delayed_work(work),
+                                               struct xlog, l_work);
+       struct xfs_mount        *mp = log->l_mp;
+
+       /* dgc: errors ignored - not fatal and nowhere to report them */
+       if (xfs_log_need_covered(mp))
+               xfs_fs_log_dummy(mp);
+       else
+               xfs_log_force(mp, 0);
+
+       /* start pushing all the metadata that is currently dirty */
+       xfs_ail_push_all(mp->m_ail);
+
+       /* queue us up again */
+       xfs_log_work_queue(mp);
+}
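
For a sense of the cadence: xfs_syncd_centisecs defaults to 3000, so
xfs_log_work_queue() re-arms the worker with msecs_to_jiffies(3000 * 10),
i.e. every 30 seconds the log is either covered (if idle) or force-flushed
and the AIL pushed, unless the tunable is changed.
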
+
 /*
  * This routine initializes some of the log structure for a given mount point.
  * Its primary purpose is to fill in enough, so recovery can occur.  However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
        log->l_logBBsize   = num_bblks;
        log->l_covered_state = XLOG_STATE_COVER_IDLE;
        log->l_flags       |= XLOG_ACTIVE_RECOVERY;
+       INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
 
        log->l_prev_block  = -1;
        /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1416,6 +1491,84 @@ xlog_grant_push_ail(
                xfs_ail_push(log->l_ailp, threshold_lsn);
 }
 
+/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+       struct xlog             *log,
+       struct xlog_in_core     *iclog,
+       int                     roundoff)
+{
+       int                     i, j, k;
+       int                     size = iclog->ic_offset + roundoff;
+       __be32                  cycle_lsn;
+       xfs_caddr_t             dp;
+
+       cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+       dp = iclog->ic_datap;
+       for (i = 0; i < BTOBB(size); i++) {
+               if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+                       break;
+               iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+               *(__be32 *)dp = cycle_lsn;
+               dp += BBSIZE;
+       }
+
+       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+               xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+               for ( ; i < BTOBB(size); i++) {
+                       j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+                       k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+                       xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+                       *(__be32 *)dp = cycle_lsn;
+                       dp += BBSIZE;
+               }
+
+               for (i = 1; i < log->l_iclog_heads; i++)
+                       xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+       }
+}
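
A worked example of the layout handled above, assuming BBSIZE of 512 bytes
and XLOG_HEADER_CYCLE_SIZE of 32k: h_cycle_data in the main record header
covers the first 32768 / 512 = 64 basic blocks. A maximally sized 256k iclog
spans 512 basic blocks, so on a v2 log the remaining 448 cycle words spill
into seven extended headers (xhdr[1..7]), and the final loop stamps xh_cycle
in each of the l_iclog_heads - 1 extended headers.
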
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+       struct xlog             *log,
+       struct xlog_rec_header  *rhead,
+       char                    *dp,
+       int                     size)
+{
+       __uint32_t              crc;
+
+       /* first generate the crc for the record header ... */
+       crc = xfs_start_cksum((char *)rhead,
+                             sizeof(struct xlog_rec_header),
+                             offsetof(struct xlog_rec_header, h_crc));
+
+       /* ... then for additional cycle data for v2 logs ... */
+       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+               union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+               int             i;
+
+               for (i = 1; i < log->l_iclog_heads; i++) {
+                       crc = crc32c(crc, &xhdr[i].hic_xheader,
+                                    sizeof(struct xlog_rec_ext_header));
+               }
+       }
+
+       /* ... and finally for the payload */
+       crc = crc32c(crc, dp, size);
+
+       return xfs_end_cksum(crc);
+}
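
xfs_start_cksum() and xfs_end_cksum() come from the xfs_cksum.h introduced by
this series. A plausible sketch of their contract, assuming the key property
that the CRC field itself is hashed as zero so the stored value can live
inside the checksummed region:

    #define XFS_CRC_SEED    (~(__uint32_t)0)

    static inline __uint32_t
    xfs_start_cksum(char *buffer, size_t length, unsigned long crc_offset)
    {
            __uint32_t zero = 0;
            __uint32_t crc;

            /* CRC up to the checksum field, then hash the field as zero */
            crc = crc32c(XFS_CRC_SEED, buffer, crc_offset);
            crc = crc32c(crc, &zero, sizeof(zero));

            /* ... and then the remainder of the buffer */
            return crc32c(crc, &buffer[crc_offset + sizeof(__uint32_t)],
                          length - crc_offset - sizeof(__uint32_t));
    }

    static inline __le32
    xfs_end_cksum(__uint32_t crc)
    {
            return ~cpu_to_le32(crc);
    }
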
+
 /*
  * The bdstrat callback function for log bufs. This gives us a central
  * place to trap bufs in case we get hit by a log I/O error and need to
@@ -1476,7 +1629,6 @@ xlog_sync(
        struct xlog             *log,
        struct xlog_in_core     *iclog)
 {
-       xfs_caddr_t     dptr;           /* pointer to byte sized element */
        xfs_buf_t       *bp;
        int             i;
        uint            count;          /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
        int             split = 0;      /* split write into two regions */
        int             error;
        int             v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+       int             size;
 
        XFS_STATS_INC(xs_log_writes);
        ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
        xlog_pack_data(log, iclog, roundoff); 
 
        /* real byte length */
-       if (v2) {
-               iclog->ic_header.h_len =
-                       cpu_to_be32(iclog->ic_offset + roundoff);
-       } else {
-               iclog->ic_header.h_len =
-                       cpu_to_be32(iclog->ic_offset);
-       }
+       size = iclog->ic_offset;
+       if (v2)
+               size += roundoff;
+       iclog->ic_header.h_len = cpu_to_be32(size);
 
        bp = iclog->ic_bp;
        XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
 
        /* Do we need to split this write into 2 parts? */
        if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+               char            *dptr;
+
                split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
                count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
-               iclog->ic_bwritecnt = 2;        /* split into 2 writes */
+               iclog->ic_bwritecnt = 2;
+
+               /*
+                * Bump the cycle numbers at the start of each block in the
+                * part of the iclog that ends up in the buffer that gets
+                * written to the start of the log.
+                *
+                * Watch out for the header magic number case, though.
+                */
+               dptr = (char *)&iclog->ic_header + count;
+               for (i = 0; i < split; i += BBSIZE) {
+                       __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+                       if (++cycle == XLOG_HEADER_MAGIC_NUM)
+                               cycle++;
+                       *(__be32 *)dptr = cpu_to_be32(cycle);
+
+                       dptr += BBSIZE;
+               }
        } else {
                iclog->ic_bwritecnt = 1;
        }
+
+       /* calculate the checksum */
+       iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+                                           iclog->ic_datap, size);
+
        bp->b_io_length = BTOBB(count);
        bp->b_fspriv = iclog;
        XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
                bp->b_flags |= XBF_SYNCIO;
                if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
                        bp->b_flags |= XBF_FUA;
-               dptr = bp->b_addr;
-               /*
-                * Bump the cycle numbers at the start of each block
-                * since this part of the buffer is at the start of
-                * a new cycle.  Watch out for the header magic number
-                * case, though.
-                */
-               for (i = 0; i < split; i += BBSIZE) {
-                       be32_add_cpu((__be32 *)dptr, 1);
-                       if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
-                               be32_add_cpu((__be32 *)dptr, 1);
-                       dptr += BBSIZE;
-               }
 
                ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
                ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
        return 0;
 }      /* xlog_sync */
 
-
 /*
  * Deallocate a log structure
  */
@@ -3713,3 +3873,4 @@ xlog_iclogs_empty(
        } while (iclog != log->l_iclog);
        return 1;
 }
+
index 748d312850e2fc7da07509971f9afbf7aa9cb6d4..5caee96059dfb3a9fe5a1f03cd84c868674626b0 100644 (file)
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
                                xfs_lsn_t *commit_lsn, int flags);
 bool   xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 
+void   xfs_log_work_queue(struct xfs_mount *mp);
+void   xfs_log_worker(struct work_struct *work);
+void   xfs_log_quiesce(struct xfs_mount *mp);
+
 #endif
 #endif /* __XFS_LOG_H__ */
index 18a801d76a4234e550427a83b6d6a65b7092d670..16d8d12ea3b472cbf57d714f2952f4782e4c8a41 100644 (file)
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
 /*
  * Flags for log structure
  */
-#define XLOG_CHKSUM_MISMATCH   0x1     /* used only during recovery */
 #define XLOG_ACTIVE_RECOVERY   0x2     /* in the middle of recovery */
 #define        XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
 #define XLOG_IO_ERROR          0x8     /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
        __be32    h_len;        /* len in bytes; should be 64-bit aligned: 4 */
        __be64    h_lsn;        /* lsn of this LR                       :  8 */
        __be64    h_tail_lsn;   /* lsn of 1st LR w/ buffers not committed: 8 */
-       __be32    h_chksum;     /* may not be used; non-zero if used    :  4 */
+       __le32    h_crc;        /* crc of log record                    :  4 */
        __be32    h_prev_block; /* block number to previous LR          :  4 */
        __be32    h_num_logops; /* number of log operations in this LR  :  4 */
        __be32    h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
        struct xfs_buf          *l_xbuf;        /* extra buffer for log
                                                 * wrapping */
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
+       struct delayed_work     l_work;         /* background flush work */
        uint                    l_flags;
        uint                    l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
        struct list_head        *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
 extern int
 xlog_recover_finish(
        struct xlog             *log);
-extern void
-xlog_pack_data(
-       struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       int);
+
+extern __le32   xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+                           char *dp, int size);
 
 extern kmem_zone_t *xfs_log_ticket_zone;
 struct xlog_ticket *
index d308749fabf126a5b318c5d55f0e58e5589ef14f..96fcbb85ff835d0223e292125f501482626bc60c 100644 (file)
@@ -41,7 +41,9 @@
 #include "xfs_trans_priv.h"
 #include "xfs_quota.h"
 #include "xfs_utils.h"
+#include "xfs_cksum.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int
 xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
                buf_flags |= XBF_UNMAPPED;
 
        bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
-                         buf_flags);
+                         buf_flags, NULL);
        if (!bp)
                return XFS_ERROR(ENOMEM);
        error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
        }
        trace_xfs_log_recover_inode_recover(log, in_f);
 
-       bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
+       bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+                         NULL);
        if (!bp) {
                error = ENOMEM;
                goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
        ASSERT(dq_f->qlf_len == 1);
 
        error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
-                                  XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+                                  XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+                                  NULL);
        if (error)
                return error;
 
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
        mp->m_dmevmask = mp_dmevmask;
 }
 
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
-       struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       int                     size)
-{
-       int             i;
-       __be32          *up;
-       uint            chksum = 0;
-
-       up = (__be32 *)iclog->ic_datap;
-       /* divide length by 4 to get # words */
-       for (i = 0; i < (size >> 2); i++) {
-               chksum ^= be32_to_cpu(*up);
-               up++;
-       }
-       iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
 /*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and CRC check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure.
  */
-void
-xlog_pack_data(
-       struct xlog             *log,
-       struct xlog_in_core     *iclog,
-       int                     roundoff)
+STATIC int
+xlog_unpack_data_crc(
+       struct xlog_rec_header  *rhead,
+       xfs_caddr_t             dp,
+       struct xlog             *log)
 {
-       int                     i, j, k;
-       int                     size = iclog->ic_offset + roundoff;
-       __be32                  cycle_lsn;
-       xfs_caddr_t             dp;
-
-       xlog_pack_data_checksum(log, iclog, size);
-
-       cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
-       dp = iclog->ic_datap;
-       for (i = 0; i < BTOBB(size) &&
-               i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
-               iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
-               *(__be32 *)dp = cycle_lsn;
-               dp += BBSIZE;
-       }
-
-       if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-               xlog_in_core_2_t *xhdr = iclog->ic_data;
-
-               for ( ; i < BTOBB(size); i++) {
-                       j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-                       k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-                       xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
-                       *(__be32 *)dp = cycle_lsn;
-                       dp += BBSIZE;
+       __le32                  crc;
+
+       crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+       if (crc != rhead->h_crc) {
+               if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+                       xfs_alert(log->l_mp,
+               "log record CRC mismatch: found 0x%x, expected 0x%x.",
+                                       le32_to_cpu(rhead->h_crc),
+                                       le32_to_cpu(crc));
+                       xfs_hex_dump(dp, 32);
                }
 
-               for (i = 1; i < log->l_iclog_heads; i++) {
-                       xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
-               }
+               /*
+                * If we've detected a log record corruption, then we can't
+                * recover past this point. Abort recovery if we are enforcing
+                * CRC protection by punting an error back up the stack.
+                */
+               if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+                       return EFSCORRUPTED;
        }
+
+       return 0;
 }
 
-STATIC void
+STATIC int
 xlog_unpack_data(
        struct xlog_rec_header  *rhead,
        xfs_caddr_t             dp,
        struct xlog             *log)
 {
        int                     i, j, k;
+       int                     error;
+
+       error = xlog_unpack_data_crc(rhead, dp, log);
+       if (error)
+               return error;
 
        for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
                  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
                        dp += BBSIZE;
                }
        }
+
+       return 0;
 }
 
 STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
                        if (error)
                                goto bread_err2;
 
-                       xlog_unpack_data(rhead, offset, log);
-                       if ((error = xlog_recover_process_data(log,
-                                               rhash, rhead, offset, pass)))
+                       error = xlog_unpack_data(rhead, offset, log);
+                       if (error)
+                               goto bread_err2;
+
+                       error = xlog_recover_process_data(log,
+                                               rhash, rhead, offset, pass);
+                       if (error)
                                goto bread_err2;
                        blk_no += bblks + hblks;
                }
@@ -3546,9 +3534,14 @@ xlog_do_recovery_pass(
                                if (error)
                                        goto bread_err2;
                        }
-                       xlog_unpack_data(rhead, offset, log);
-                       if ((error = xlog_recover_process_data(log, rhash,
-                                                       rhead, offset, pass)))
+
+                       error = xlog_unpack_data(rhead, offset, log);
+                       if (error)
+                               goto bread_err2;
+
+                       error = xlog_recover_process_data(log, rhash,
+                                                       rhead, offset, pass);
+                       if (error)
                                goto bread_err2;
                        blk_no += bblks;
                }
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
                        if (error)
                                goto bread_err2;
 
-                       xlog_unpack_data(rhead, offset, log);
-                       if ((error = xlog_recover_process_data(log, rhash,
-                                                       rhead, offset, pass)))
+                       error = xlog_unpack_data(rhead, offset, log);
+                       if (error)
+                               goto bread_err2;
+
+                       error = xlog_recover_process_data(log, rhash,
+                                                       rhead, offset, pass);
+                       if (error)
                                goto bread_err2;
                        blk_no += bblks + hblks;
                }
@@ -3689,13 +3686,14 @@ xlog_do_recover(
 
        /*
         * Now that we've finished replaying all buffer and inode
-        * updates, re-read in the superblock.
+        * updates, re-read in the superblock and reverify it.
         */
        bp = xfs_getsb(log->l_mp, 0);
        XFS_BUF_UNDONE(bp);
        ASSERT(!(XFS_BUF_ISWRITE(bp)));
        XFS_BUF_READ(bp);
        XFS_BUF_UNASYNC(bp);
+       bp->b_ops = &xfs_sb_buf_ops;
        xfsbdstrat(log->l_mp, bp);
        error = xfs_buf_iowait(bp);
        if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
 
        /* Convert superblock from on-disk format */
        sbp = &log->l_mp->m_sb;
-       xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
+       xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
        ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
        ASSERT(xfs_sb_good_version(sbp));
        xfs_buf_relse(bp);
index b2bd3a0e6376e1190e1bbb747435acdaca998bb6..da508463ff1006b7b4b5371d5f2c19495cb5d44b 100644 (file)
@@ -42,6 +42,7 @@
 #include "xfs_fsops.h"
 #include "xfs_utils.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 
 #ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
 xfs_mount_validate_sb(
        xfs_mount_t     *mp,
        xfs_sb_t        *sbp,
-       int             flags)
+       bool            check_inprogress)
 {
-       int             loud = !(flags & XFS_MFSI_QUIET);
 
        /*
         * If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
         * a volume filesystem in a non-volume manner.
         */
        if (sbp->sb_magicnum != XFS_SB_MAGIC) {
-               if (loud)
-                       xfs_warn(mp, "bad magic number");
+               xfs_warn(mp, "bad magic number");
                return XFS_ERROR(EWRONGFS);
        }
 
        if (!xfs_sb_good_version(sbp)) {
-               if (loud)
-                       xfs_warn(mp, "bad version");
+               xfs_warn(mp, "bad version");
                return XFS_ERROR(EWRONGFS);
        }
 
        if (unlikely(
            sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
-               if (loud)
-                       xfs_warn(mp,
+               xfs_warn(mp,
                "filesystem is marked as having an external log; "
                "specify logdev on the mount command line.");
                return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
 
        if (unlikely(
            sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
-               if (loud)
-                       xfs_warn(mp,
+               xfs_warn(mp,
                "filesystem is marked as having an internal log; "
                "do not specify logdev on the mount command line.");
                return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
            sbp->sb_dblocks == 0                                        ||
            sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp)                      ||
            sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
-               if (loud)
-                       XFS_CORRUPTION_ERROR("SB sanity check failed",
+               XFS_CORRUPTION_ERROR("SB sanity check failed",
                                XFS_ERRLEVEL_LOW, mp, sbp);
                return XFS_ERROR(EFSCORRUPTED);
        }
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
         * Until this is fixed only page-sized or smaller data blocks work.
         */
        if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
-               if (loud) {
-                       xfs_warn(mp,
+               xfs_warn(mp,
                "File system with blocksize %d bytes. "
                "Only pagesize (%ld) or less will currently work.",
                                sbp->sb_blocksize, PAGE_SIZE);
-               }
                return XFS_ERROR(ENOSYS);
        }
 
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
        case 2048:
                break;
        default:
-               if (loud)
-                       xfs_warn(mp, "inode size of %d bytes not supported",
+               xfs_warn(mp, "inode size of %d bytes not supported",
                                sbp->sb_inodesize);
                return XFS_ERROR(ENOSYS);
        }
 
        if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
            xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
-               if (loud)
-                       xfs_warn(mp,
+               xfs_warn(mp,
                "file system too large to be mounted on this system.");
                return XFS_ERROR(EFBIG);
        }
 
-       if (unlikely(sbp->sb_inprogress)) {
-               if (loud)
-                       xfs_warn(mp, "file system busy");
+       if (check_inprogress && sbp->sb_inprogress) {
+               xfs_warn(mp, "Offline file system operation in progress!");
                return XFS_ERROR(EFSCORRUPTED);
        }
 
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
         * Version 1 directory format has never worked on Linux.
         */
        if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
-               if (loud)
-                       xfs_warn(mp,
-                               "file system using version 1 directory format");
+               xfs_warn(mp, "file system using version 1 directory format");
                return XFS_ERROR(ENOSYS);
        }
 
@@ -520,11 +508,9 @@ out_unwind:
 
 void
 xfs_sb_from_disk(
-       struct xfs_mount        *mp,
+       struct xfs_sb   *to,
        xfs_dsb_t       *from)
 {
-       struct xfs_sb *to = &mp->m_sb;
-
        to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
        to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
        to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
        }
 }
 
+static void
+xfs_sb_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_mount *mp = bp->b_target->bt_mount;
+       struct xfs_sb   sb;
+       int             error;
+
+       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+       /*
+        * Only check the in progress field for the primary superblock as
+        * mkfs.xfs doesn't clear it from secondary superblocks.
+        */
+       error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+       if (error)
+               xfs_buf_ioerror(bp, error);
+}
+
+static void
+xfs_sb_read_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_sb_verify(bp);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+       struct xfs_buf  *bp)
+{
+       struct xfs_sb   sb;
+
+       xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+       if (sb.sb_magicnum == XFS_SB_MAGIC) {
+               /* XFS filesystem, verify noisily! */
+               xfs_sb_read_verify(bp);
+               return;
+       }
+       /* quietly fail */
+       xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
+static void
+xfs_sb_write_verify(
+       struct xfs_buf  *bp)
+{
+       xfs_sb_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+       .verify_read = xfs_sb_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+       .verify_read = xfs_sb_quiet_read_verify,
+       .verify_write = xfs_sb_write_verify,
+};
+
 /*
  * xfs_readsb
  *
@@ -651,25 +703,26 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 
 reread:
        bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
-                                       BTOBB(sector_size), 0);
+                                  BTOBB(sector_size), 0,
+                                  loud ? &xfs_sb_buf_ops
+                                       : &xfs_sb_quiet_buf_ops);
        if (!bp) {
                if (loud)
                        xfs_warn(mp, "SB buffer read failed");
                return EIO;
        }
-
-       /*
-        * Initialize the mount structure from the superblock.
-        * But first do some basic consistency checking.
-        */
-       xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
-       error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
-       if (error) {
+       if (bp->b_error) {
+               error = bp->b_error;
                if (loud)
                        xfs_warn(mp, "SB validate failed");
                goto release_buf;
        }
 
+       /*
+        * Initialize the mount structure from the superblock.
+        */
+       xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+
        /*
         * We must be able to do sector-sized and sector-aligned IO.
         */
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
        }
        bp = xfs_buf_read_uncached(mp->m_ddev_targp,
                                        d - XFS_FSS_TO_BB(mp, 1),
-                                       XFS_FSS_TO_BB(mp, 1), 0);
+                                       XFS_FSS_TO_BB(mp, 1), 0, NULL);
        if (!bp) {
                xfs_warn(mp, "last sector read failed");
                return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
                }
                bp = xfs_buf_read_uncached(mp->m_logdev_targp,
                                        d - XFS_FSB_TO_BB(mp, 1),
-                                       XFS_FSB_TO_BB(mp, 1), 0);
+                                       XFS_FSB_TO_BB(mp, 1), 0, NULL);
                if (!bp) {
                        xfs_warn(mp, "log device read failed");
                        return EIO;
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
        __uint64_t              resblks;
        int                     error;
 
+       cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
        xfs_qm_unmount_quotas(mp);
        xfs_rtunmount_inodes(mp);
        IRELE(mp->m_rootip);
@@ -1450,20 +1505,15 @@ xfs_unmountfs(
 
        /*
         * And reclaim all inodes.  At this point there should be no dirty
-        * inode, and none should be pinned or locked, but use synchronous
-        * reclaim just to be sure.
+        * inodes and none should be pinned or locked, but use synchronous
+        * reclaim just to be sure. We can stop background inode reclaim
+        * here as well if it is still running.
         */
+       cancel_delayed_work_sync(&mp->m_reclaim_work);
        xfs_reclaim_inodes(mp, SYNC_WAIT);
 
        xfs_qm_unmount(mp);
 
-       /*
-        * Flush out the log synchronously so that we know for sure
-        * that nothing is pinned.  This is important because bflush()
-        * will skip pinned buffers.
-        */
-       xfs_log_force(mp, XFS_LOG_SYNC);
-
        /*
         * Unreserve any blocks we have so that when we unmount we don't account
         * the reserved free space as used. This is really only necessary for
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
                xfs_warn(mp, "Unable to update superblock counters. "
                                "Freespace may not be correct on next mount.");
 
-       /*
-        * At this point we might have modified the superblock again and thus
-        * added an item to the AIL, thus flush it again.
-        */
-       xfs_ail_push_all_sync(mp->m_ail);
-       xfs_wait_buftarg(mp->m_ddev_targp);
-
-       /*
-        * The superblock buffer is uncached and xfsaild_push() will lock and
-        * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-        * here but a lock on the superblock buffer will block until iodone()
-        * has completed.
-        */
-       xfs_buf_lock(mp->m_sb_bp);
-       xfs_buf_unlock(mp->m_sb_bp);
-
-       xfs_log_unmount_write(mp);
        xfs_log_unmount(mp);
        xfs_uuid_unmount(mp);
 
index deee09e534dcf35de23535bfa6ce94f6ffa750a6..bab8314507e44177bb86bf8a88c6318da0b46141 100644 (file)
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
 
 #else /* __KERNEL__ */
 
-#include "xfs_sync.h"
-
 struct xlog;
 struct xfs_inode;
 struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
        struct mutex            m_icsb_mutex;   /* balancer sync lock */
 #endif
        struct xfs_mru_cache    *m_filestream;  /* per-mount filestream data */
-       struct delayed_work     m_sync_work;    /* background sync work */
        struct delayed_work     m_reclaim_work; /* background inode reclaim */
-       struct work_struct      m_flush_work;   /* background inode flush */
+       struct delayed_work     m_eofblocks_work; /* background eof blocks
+                                                    trimming */
        __int64_t               m_update_flags; /* sb flags we need to update
                                                   on the next remount,rw */
        struct shrinker         m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
        struct workqueue_struct *m_data_workqueue;
        struct workqueue_struct *m_unwritten_workqueue;
        struct workqueue_struct *m_cil_workqueue;
+       struct workqueue_struct *m_reclaim_workqueue;
+       struct workqueue_struct *m_log_workqueue;
+       struct workqueue_struct *m_eofblocks_workqueue;
 } xfs_mount_t;
 
 /*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
 extern void    xfs_mod_sb(struct xfs_trans *, __int64_t);
 extern int     xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
                                        xfs_agnumber_t *);
-extern void    xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
+extern void    xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
 extern void    xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
 
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+
 #endif /* __XFS_MOUNT_H__ */
index 2e86fa0cfc0d660374b2d8860f7c30e059b94e2b..60eff4763156f9a83a5b5ab49eaca1c1a2df63a8 100644 (file)
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
        while (blkcnt--) {
                error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
                              XFS_FSB_TO_DADDR(mp, bno),
-                             mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+                             mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+                             &xfs_dquot_buf_ops);
                if (error)
                        break;
 
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
                                while (rablkcnt--) {
                                        xfs_buf_readahead(mp->m_ddev_targp,
                                               XFS_FSB_TO_DADDR(mp, rablkno),
-                                              mp->m_quotainfo->qi_dqchunklen);
+                                              mp->m_quotainfo->qi_dqchunklen,
+                                              NULL);
                                        rablkno++;
                                }
                        }
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
        int                     error;
 
        if (!xfs_dqlock_nowait(dqp))
-               goto out_busy;
+               goto out_move_tail;
 
        /*
         * This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
         * getting flushed to disk, we don't want to reclaim it.
         */
        if (!xfs_dqflock_nowait(dqp))
-               goto out_busy;
+               goto out_unlock_move_tail;
 
        if (XFS_DQ_IS_DIRTY(dqp)) {
                struct xfs_buf  *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
                if (error) {
                        xfs_warn(mp, "%s: dquot %p flush failed",
                                 __func__, dqp);
-                       goto out_busy;
+                       goto out_unlock_move_tail;
                }
 
                xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
                 * Give the dquot another try on the freelist, as the
                 * flushing will take some time.
                 */
-               goto out_busy;
+               goto out_unlock_move_tail;
        }
        xfs_dqfunlock(dqp);
 
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
        XFS_STATS_INC(xs_qm_dqreclaims);
        return;
 
-out_busy:
-       xfs_dqunlock(dqp);
-
        /*
         * Move the dquot to the tail of the list so that we don't spin on it.
         */
+out_unlock_move_tail:
+       xfs_dqunlock(dqp);
+out_move_tail:
        list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-
        trace_xfs_dqreclaim_busy(dqp);
        XFS_STATS_INC(xs_qm_dqreclaim_misses);
 }
index 858a3b186110ab74760ebebf0d6b8c1326a9ee5a..5f53e75409b8f45ad919aae17b24cf92c5218a06 100644 (file)
@@ -40,6 +40,7 @@
 #include "xfs_utils.h"
 #include "xfs_qm.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 STATIC int     xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
 STATIC int     xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
 xfs_dqrele_inode(
        struct xfs_inode        *ip,
        struct xfs_perag        *pag,
-       int                     flags)
+       int                     flags,
+       void                    *args)
 {
        /* skip quota inodes */
        if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
        uint             flags)
 {
        ASSERT(mp->m_quotainfo);
-       xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+       xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
 }
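
The new void *args slot lets callers of the AG iterator thread private state
through to the callback; xfs_dqrele_inode simply ignores it. A hedged sketch
with hypothetical names:

	STATIC int
	xfs_example_walk_cb(
		struct xfs_inode	*ip,
		struct xfs_perag	*pag,
		int			flags,
		void			*args)
	{
		struct xfs_example_ctl	*ctl = args;	/* caller's private state */

		/* ... filter or act on ip using ctl ... */
		return 0;
	}

	xfs_inode_ag_iterator(mp, xfs_example_walk_cb, flags, &ctl);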
index ca28a4ba4b548f0c379291bfb0e716ac3a9ec54e..98dc670d3ee04182da47b27e7db1695b71807434 100644 (file)
@@ -38,6 +38,7 @@
 #include "xfs_utils.h"
 #include "xfs_trace.h"
 #include "xfs_buf.h"
+#include "xfs_icache.h"
 
 
 /*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
        ASSERT(map.br_startblock != NULLFSBLOCK);
        error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                                   XFS_FSB_TO_DADDR(mp, map.br_startblock),
-                                  mp->m_bsize, 0, &bp);
+                                  mp->m_bsize, 0, &bp, NULL);
        if (error)
                return error;
        ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
         */
        bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
                                XFS_FSB_TO_BB(mp, nrblocks - 1),
-                               XFS_FSB_TO_BB(mp, 1), 0);
+                               XFS_FSB_TO_BB(mp, 1), 0, NULL);
        if (!bp)
                return EIO;
+       if (bp->b_error) {
+               error = bp->b_error;
+               xfs_buf_relse(bp);
+               return error;
+       }
        xfs_buf_relse(bp);
 
        /*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
        }
        bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
                                        d - XFS_FSB_TO_BB(mp, 1),
-                                       XFS_FSB_TO_BB(mp, 1), 0);
-       if (!bp) {
+                                       XFS_FSB_TO_BB(mp, 1), 0, NULL);
+       if (!bp || bp->b_error) {
                xfs_warn(mp, "realtime device size check failed");
+               if (bp)
+                       xfs_buf_relse(bp);
                return EIO;
        }
        xfs_buf_relse(bp);
index f429d9d5d325d8f1e48f13efda8b1b9a2d4b6f7e..a05b45175fb06d49517175dadc9f644fb79166f5 100644 (file)
@@ -81,6 +81,7 @@ struct xfs_mount;
 #define XFS_SB_VERSION2_ATTR2BIT       0x00000008      /* Inline attr rework */
 #define XFS_SB_VERSION2_PARENTBIT      0x00000010      /* parent pointers */
 #define XFS_SB_VERSION2_PROJID32BIT    0x00000080      /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT         0x00000100      /* metadata CRCs */
 
 #define        XFS_SB_VERSION2_OKREALFBITS     \
        (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
                (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
 }
 
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+{
+       return (xfs_sb_version_hasmorebits(sbp) &&
+               (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+}
+
 /*
  * end of superblock version macros
  */
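
Code elsewhere can now gate CRC-aware behaviour on this feature bit; a short
usage sketch (the surrounding logic is illustrative only):

	if (xfs_sb_version_hascrc(&mp->m_sb)) {
		/* CRC-enabled filesystem: compute/verify metadata CRCs */
	}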
index 26a09bd7f975f1f344083034b15960e43e4663a6..ab8839b262725dd8a36482aee672288f9f052655 100644 (file)
@@ -49,7 +49,7 @@
 #include "xfs_extfree_item.h"
 #include "xfs_mru_cache.h"
 #include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
 #include "xfs_trace.h"
 
 #include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
                        WQ_MEM_RECLAIM, 0, mp->m_fsname);
        if (!mp->m_cil_workqueue)
                goto out_destroy_unwritten;
+
+       mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+       if (!mp->m_reclaim_workqueue)
+               goto out_destroy_cil;
+
+       mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+       if (!mp->m_log_workqueue)
+               goto out_destroy_reclaim;
+
+       mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+                       WQ_NON_REENTRANT, 0, mp->m_fsname);
+       if (!mp->m_eofblocks_workqueue)
+               goto out_destroy_log;
+
        return 0;
 
+out_destroy_log:
+       destroy_workqueue(mp->m_log_workqueue);
+out_destroy_reclaim:
+       destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+       destroy_workqueue(mp->m_cil_workqueue);
 out_destroy_unwritten:
        destroy_workqueue(mp->m_unwritten_workqueue);
 out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
 xfs_destroy_mount_workqueues(
        struct xfs_mount        *mp)
 {
+       destroy_workqueue(mp->m_eofblocks_workqueue);
+       destroy_workqueue(mp->m_log_workqueue);
+       destroy_workqueue(mp->m_reclaim_workqueue);
        destroy_workqueue(mp->m_cil_workqueue);
        destroy_workqueue(mp->m_data_workqueue);
        destroy_workqueue(mp->m_unwritten_workqueue);
 }
 
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+       struct xfs_mount        *mp)
+{
+       struct super_block      *sb = mp->m_super;
+
+       if (down_read_trylock(&sb->s_umount)) {
+               sync_inodes_sb(sb);
+               up_read(&sb->s_umount);
+       }
+}
+
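
A hypothetical caller in a write path; the ENOSPC retry scaffolding below is
illustrative and not part of this patch:

	error = xfs_do_allocation(ip, ...);	/* hypothetical allocation call */
	if (error == ENOSPC && !retried) {
		/* blocks until writeback IO completes, throttling retries */
		xfs_flush_inodes(ip->i_mount);
		retried = 1;
		goto retry;
	}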
 /* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
        struct xfs_mount        *mp = XFS_M(sb);
 
        xfs_filestream_unmount(mp);
-       cancel_delayed_work_sync(&mp->m_sync_work);
        xfs_unmountfs(mp);
-       xfs_syncd_stop(mp);
+
        xfs_freesb(mp);
        xfs_icsb_destroy_counters(mp);
        xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
        int                     wait)
 {
        struct xfs_mount        *mp = XFS_M(sb);
-       int                     error;
 
        /*
         * Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
        if (!wait)
                return 0;
 
-       error = xfs_quiesce_data(mp);
-       if (error)
-               return -error;
-
+       xfs_log_force(mp, XFS_LOG_SYNC);
        if (laptop_mode) {
                /*
                 * The disk must be active because we're syncing.
-                * We schedule xfssyncd now (now that the disk is
+                * We schedule log work now (now that the disk is
                 * active) instead of later (when it might not be).
                 */
-               flush_delayed_work(&mp->m_sync_work);
+               flush_delayed_work(&mp->m_log->l_work);
        }
 
        return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
        xfs_reserve_blocks(mp, &resblks, NULL);
 }
 
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to its location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
+ *
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+       struct xfs_mount        *mp)
+{
+       int     error = 0;
+
+       /* wait for all modifications to complete */
+       while (atomic_read(&mp->m_active_trans) > 0)
+               delay(100);
+
+       /* force the log to unpin objects from the now complete transactions */
+       xfs_log_force(mp, XFS_LOG_SYNC);
+
+       /* reclaim inodes to do any IO before the freeze completes */
+       xfs_reclaim_inodes(mp, 0);
+       xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+       /* Push the superblock and write an unmount record */
+       error = xfs_log_sbcount(mp);
+       if (error)
+               xfs_warn(mp, "xfs_quiesce_attr: failed to log sb changes. "
+                               "Frozen image may not be consistent.");
+       /*
+        * Just warn here till VFS can correctly support
+        * read-only remount without racing.
+        */
+       WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+       xfs_log_quiesce(mp);
+}
+
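
Per the note above, a quiesce leaves background log work stopped, so callers
pair it with a restart; the pairing visible elsewhere in this patch, in
sketch form:

	xfs_quiesce_attr(mp);		/* freeze, or rw -> ro remount */
	...
	xfs_log_work_queue(mp);		/* unfreeze, or ro -> rw remount */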
 STATIC int
 xfs_fs_remount(
        struct super_block      *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
                 * value if it is non-zero, otherwise go with the default.
                 */
                xfs_restore_resvblks(mp);
+               xfs_log_work_queue(mp);
        }
 
        /* rw -> ro */
        if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
                /*
-                * After we have synced the data but before we sync the
-                * metadata, we need to free up the reserve block pool so that
-                * the used block count in the superblock on disk is correct at
-                * the end of the remount. Stash the current reserve pool size
-                * so that if we get remounted rw, we can return it to the same
-                * size.
+                * Before we sync the metadata, we need to free up the reserve
+                * block pool so that the used block count in the superblock on
+                * disk is correct at the end of the remount. Stash the current
+                * reserve pool size so that if we get remounted rw, we can
+                * return it to the same size.
                 */
-
-               xfs_quiesce_data(mp);
                xfs_save_resvblks(mp);
                xfs_quiesce_attr(mp);
                mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
        struct xfs_mount        *mp = XFS_M(sb);
 
        xfs_restore_resvblks(mp);
+       xfs_log_work_queue(mp);
        return 0;
 }
 
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
        spin_lock_init(&mp->m_sb_lock);
        mutex_init(&mp->m_growlock);
        atomic_set(&mp->m_active_trans, 0);
+       INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+       INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
 
        mp->m_super = sb;
        sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
        /*
         * we must configure the block size in the superblock before we run the
         * full mount process as the mount process can lookup and cache inodes.
-        * For the same reason we must also initialise the syncd and register
-        * the inode cache shrinker so that inodes can be reclaimed during
-        * operations like a quotacheck that iterate all inodes in the
-        * filesystem.
         */
        sb->s_magic = XFS_SB_MAGIC;
        sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
        sb->s_time_gran = 1;
        set_posix_acl_flag(sb);
 
-       error = xfs_syncd_init(mp);
-       if (error)
-               goto out_filestream_unmount;
-
        error = xfs_mountfs(mp);
        if (error)
-               goto out_syncd_stop;
+               goto out_filestream_unmount;
 
        root = igrab(VFS_I(mp->m_rootip));
        if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
        }
 
        return 0;
- out_syncd_stop:
-       xfs_syncd_stop(mp);
+
  out_filestream_unmount:
        xfs_filestream_unmount(mp);
  out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
  out_unmount:
        xfs_filestream_unmount(mp);
        xfs_unmountfs(mp);
-       xfs_syncd_stop(mp);
        goto out_free_sb;
 }
 
@@ -1624,16 +1695,6 @@ xfs_destroy_zones(void)
 STATIC int __init
 xfs_init_workqueues(void)
 {
-       /*
-        * We never want the same work item to run twice; reclaiming inodes
-        * or idling the log is not going to get any faster with multiple CPUs
-        * competing for resources.  Use the default large max_active value
-        * so that even lots of filesystems can perform these tasks in parallel.
-        */
-       xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
-       if (!xfs_syncd_wq)
-               return -ENOMEM;
-
        /*
         * The allocation workqueue can be used in memory reclaim situations
         * (writepage path), and parallelism is only limited by the number of
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
         */
        xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
        if (!xfs_alloc_wq)
-               goto out_destroy_syncd;
+               return -ENOMEM;
 
        return 0;
-
-out_destroy_syncd:
-       destroy_workqueue(xfs_syncd_wq);
-       return -ENOMEM;
 }
 
 STATIC void
 xfs_destroy_workqueues(void)
 {
        destroy_workqueue(xfs_alloc_wq);
-       destroy_workqueue(xfs_syncd_wq);
 }
 
 STATIC int __init
index 9de4a920ba05962a655c81a87069254ebfbefc28..bbe3d15a7904d6185ab440c3b2a12533322e72a7 100644 (file)
@@ -74,6 +74,7 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
+extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
 extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
deleted file mode 100644 (file)
index 9500caf..0000000
+++ /dev/null
@@ -1,973 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_trans_priv.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_dinode.h"
-#include "xfs_error.h"
-#include "xfs_filestream.h"
-#include "xfs_vnodeops.h"
-#include "xfs_inode_item.h"
-#include "xfs_quota.h"
-#include "xfs_trace.h"
-#include "xfs_fsops.h"
-
-#include <linux/kthread.h>
-#include <linux/freezer.h>
-
-struct workqueue_struct        *xfs_syncd_wq;  /* sync workqueue */
-
-/*
- * The inode lookup is done in batches to keep the amount of lock traffic and
- * radix tree lookups to a minimum. The batch size is a trade off between
- * lookup reduction and stack usage. This is in the reclaim path, so we can't
- * be too greedy.
- */
-#define XFS_LOOKUP_BATCH       32
-
-STATIC int
-xfs_inode_ag_walk_grab(
-       struct xfs_inode        *ip)
-{
-       struct inode            *inode = VFS_I(ip);
-
-       ASSERT(rcu_read_lock_held());
-
-       /*
-        * check for stale RCU freed inode
-        *
-        * If the inode has been reallocated, it doesn't matter if it's not in
-        * the AG we are walking - we are walking for writeback, so if it
-        * passes all the "valid inode" checks and is dirty, then we'll write
-        * it back anyway.  If it has been reallocated and still being
-        * initialised, the XFS_INEW check below will catch it.
-        */
-       spin_lock(&ip->i_flags_lock);
-       if (!ip->i_ino)
-               goto out_unlock_noent;
-
-       /* avoid new or reclaimable inodes. Leave for reclaim code to flush */
-       if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
-               goto out_unlock_noent;
-       spin_unlock(&ip->i_flags_lock);
-
-       /* nothing to sync during shutdown */
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-               return EFSCORRUPTED;
-
-       /* If we can't grab the inode, it must be on its way to reclaim. */
-       if (!igrab(inode))
-               return ENOENT;
-
-       if (is_bad_inode(inode)) {
-               IRELE(ip);
-               return ENOENT;
-       }
-
-       /* inode is valid */
-       return 0;
-
-out_unlock_noent:
-       spin_unlock(&ip->i_flags_lock);
-       return ENOENT;
-}
-
-STATIC int
-xfs_inode_ag_walk(
-       struct xfs_mount        *mp,
-       struct xfs_perag        *pag,
-       int                     (*execute)(struct xfs_inode *ip,
-                                          struct xfs_perag *pag, int flags),
-       int                     flags)
-{
-       uint32_t                first_index;
-       int                     last_error = 0;
-       int                     skipped;
-       int                     done;
-       int                     nr_found;
-
-restart:
-       done = 0;
-       skipped = 0;
-       first_index = 0;
-       nr_found = 0;
-       do {
-               struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-               int             error = 0;
-               int             i;
-
-               rcu_read_lock();
-               nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
-                                       (void **)batch, first_index,
-                                       XFS_LOOKUP_BATCH);
-               if (!nr_found) {
-                       rcu_read_unlock();
-                       break;
-               }
-
-               /*
-                * Grab the inodes before we drop the lock. if we found
-                * nothing, nr == 0 and the loop will be skipped.
-                */
-               for (i = 0; i < nr_found; i++) {
-                       struct xfs_inode *ip = batch[i];
-
-                       if (done || xfs_inode_ag_walk_grab(ip))
-                               batch[i] = NULL;
-
-                       /*
-                        * Update the index for the next lookup. Catch
-                        * overflows into the next AG range which can occur if
-                        * we have inodes in the last block of the AG and we
-                        * are currently pointing to the last inode.
-                        *
-                        * Because we may see inodes that are from the wrong AG
-                        * due to RCU freeing and reallocation, only update the
-                        * index if it lies in this AG. It was a race that led
-                        * us to see this inode, so another lookup from the
-                        * same index will not find it again.
-                        */
-                       if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
-                               continue;
-                       first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                       if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                               done = 1;
-               }
-
-               /* unlock now we've grabbed the inodes. */
-               rcu_read_unlock();
-
-               for (i = 0; i < nr_found; i++) {
-                       if (!batch[i])
-                               continue;
-                       error = execute(batch[i], pag, flags);
-                       IRELE(batch[i]);
-                       if (error == EAGAIN) {
-                               skipped++;
-                               continue;
-                       }
-                       if (error && last_error != EFSCORRUPTED)
-                               last_error = error;
-               }
-
-               /* bail out if the filesystem is corrupted.  */
-               if (error == EFSCORRUPTED)
-                       break;
-
-               cond_resched();
-
-       } while (nr_found && !done);
-
-       if (skipped) {
-               delay(1);
-               goto restart;
-       }
-       return last_error;
-}
-
-int
-xfs_inode_ag_iterator(
-       struct xfs_mount        *mp,
-       int                     (*execute)(struct xfs_inode *ip,
-                                          struct xfs_perag *pag, int flags),
-       int                     flags)
-{
-       struct xfs_perag        *pag;
-       int                     error = 0;
-       int                     last_error = 0;
-       xfs_agnumber_t          ag;
-
-       ag = 0;
-       while ((pag = xfs_perag_get(mp, ag))) {
-               ag = pag->pag_agno + 1;
-               error = xfs_inode_ag_walk(mp, pag, execute, flags);
-               xfs_perag_put(pag);
-               if (error) {
-                       last_error = error;
-                       if (error == EFSCORRUPTED)
-                               break;
-               }
-       }
-       return XFS_ERROR(last_error);
-}
-
-STATIC int
-xfs_sync_inode_data(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     flags)
-{
-       struct inode            *inode = VFS_I(ip);
-       struct address_space *mapping = inode->i_mapping;
-       int                     error = 0;
-
-       if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-               return 0;
-
-       if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-               if (flags & SYNC_TRYLOCK)
-                       return 0;
-               xfs_ilock(ip, XFS_IOLOCK_SHARED);
-       }
-
-       error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-                               0 : XBF_ASYNC, FI_NONE);
-       xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-       return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-       struct xfs_mount        *mp,
-       int                     flags)
-{
-       int                     error;
-
-       ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-       error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-       if (error)
-               return XFS_ERROR(error);
-
-       xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-       return 0;
-}
-
-STATIC int
-xfs_sync_fsdata(
-       struct xfs_mount        *mp)
-{
-       struct xfs_buf          *bp;
-       int                     error;
-
-       /*
-        * If the buffer is pinned then push on the log so we won't get stuck
-        * waiting in the write for someone, maybe ourselves, to flush the log.
-        *
-        * Even though we just pushed the log above, we did not have the
-        * superblock buffer locked at that point so it can become pinned in
-        * between there and here.
-        */
-       bp = xfs_getsb(mp, 0);
-       if (xfs_buf_ispinned(bp))
-               xfs_log_force(mp, 0);
-       error = xfs_bwrite(bp);
-       xfs_buf_relse(bp);
-       return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete.  Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
-int
-xfs_quiesce_data(
-       struct xfs_mount        *mp)
-{
-       int                     error, error2 = 0;
-
-       /* force out the log */
-       xfs_log_force(mp, XFS_LOG_SYNC);
-
-       /* write superblock and hoover up shutdown errors */
-       error = xfs_sync_fsdata(mp);
-
-       /* mark the log as covered if needed */
-       if (xfs_log_need_covered(mp))
-               error2 = xfs_fs_log_dummy(mp);
-
-       return error ? error : error2;
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- */
-void
-xfs_quiesce_attr(
-       struct xfs_mount        *mp)
-{
-       int     error = 0;
-
-       /* wait for all modifications to complete */
-       while (atomic_read(&mp->m_active_trans) > 0)
-               delay(100);
-
-       /* reclaim inodes to do any IO before the freeze completes */
-       xfs_reclaim_inodes(mp, 0);
-       xfs_reclaim_inodes(mp, SYNC_WAIT);
-
-       /* flush all pending changes from the AIL */
-       xfs_ail_push_all_sync(mp->m_ail);
-
-       /*
-        * Just warn here till VFS can correctly support
-        * read-only remount without racing.
-        */
-       WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
-       /* Push the superblock and write an unmount record */
-       error = xfs_log_sbcount(mp);
-       if (error)
-               xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
-                               "Frozen image may not be consistent.");
-       xfs_log_unmount_write(mp);
-
-       /*
-        * At this point we might have modified the superblock again and thus
-        * added an item to the AIL, thus flush it again.
-        */
-       xfs_ail_push_all_sync(mp->m_ail);
-
-       /*
-        * The superblock buffer is uncached and xfsaild_push() will lock and
-        * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
-        * here but a lock on the superblock buffer will block until iodone()
-        * has completed.
-        */
-       xfs_buf_lock(mp->m_sb_bp);
-       xfs_buf_unlock(mp->m_sb_bp);
-}
-
-static void
-xfs_syncd_queue_sync(
-       struct xfs_mount        *mp)
-{
-       queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
-                               msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas.  We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
- */
-STATIC void
-xfs_sync_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_sync_work);
-       int             error;
-
-       /*
-        * We shouldn't write/force the log if we are in the mount/unmount
-        * process or on a read only filesystem. The workqueue still needs to be
-        * active in both cases, however, because it is used for inode reclaim
-        * during these times.  Use the MS_ACTIVE flag to avoid doing anything
-        * during mount.  Doing work during unmount is avoided by calling
-        * cancel_delayed_work_sync on this work queue before tearing down
-        * the ail and the log in xfs_log_unmount.
-        */
-       if (!(mp->m_super->s_flags & MS_ACTIVE) &&
-           !(mp->m_flags & XFS_MOUNT_RDONLY)) {
-               /* dgc: errors ignored here */
-               if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
-                   xfs_log_need_covered(mp))
-                       error = xfs_fs_log_dummy(mp);
-               else
-                       xfs_log_force(mp, 0);
-
-               /* start pushing all the metadata that is currently
-                * dirty */
-               xfs_ail_push_all(mp->m_ail);
-       }
-
-       /* queue us up again */
-       xfs_syncd_queue_sync(mp);
-}
-
-/*
- * Queue a new inode reclaim pass if there are reclaimable inodes and there
- * isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have its own
- * tunable, but that can be done if this method proves to be ineffective or too
- * aggressive.
- */
-static void
-xfs_syncd_queue_reclaim(
-       struct xfs_mount        *mp)
-{
-
-       rcu_read_lock();
-       if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
-               queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
-                       msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
-       }
-       rcu_read_unlock();
-}
-
-/*
- * This is a fast pass over the inode cache to try to get reclaim moving on as
- * many inodes as possible in a short period of time. It kicks itself every few
- * seconds, as well as being kicked by the inode cache shrinker when memory
- * goes low. It scans as quickly as possible avoiding locked inodes or those
- * already being flushed, and once done schedules a future pass.
- */
-STATIC void
-xfs_reclaim_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(to_delayed_work(work),
-                                       struct xfs_mount, m_reclaim_work);
-
-       xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
-       xfs_syncd_queue_reclaim(mp);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations.  At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-       struct xfs_inode        *ip)
-{
-       struct xfs_mount        *mp = ip->i_mount;
-
-       queue_work(xfs_syncd_wq, &mp->m_flush_work);
-       flush_work(&mp->m_flush_work);
-}
-
-STATIC void
-xfs_flush_worker(
-       struct work_struct *work)
-{
-       struct xfs_mount *mp = container_of(work,
-                                       struct xfs_mount, m_flush_work);
-
-       xfs_sync_data(mp, SYNC_TRYLOCK);
-       xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
-int
-xfs_syncd_init(
-       struct xfs_mount        *mp)
-{
-       INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
-       INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
-       INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
-       xfs_syncd_queue_sync(mp);
-
-       return 0;
-}
-
-void
-xfs_syncd_stop(
-       struct xfs_mount        *mp)
-{
-       cancel_delayed_work_sync(&mp->m_sync_work);
-       cancel_delayed_work_sync(&mp->m_reclaim_work);
-       cancel_work_sync(&mp->m_flush_work);
-}
-
-void
-__xfs_inode_set_reclaim_tag(
-       struct xfs_perag        *pag,
-       struct xfs_inode        *ip)
-{
-       radix_tree_tag_set(&pag->pag_ici_root,
-                          XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-                          XFS_ICI_RECLAIM_TAG);
-
-       if (!pag->pag_ici_reclaimable) {
-               /* propagate the reclaim tag up into the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_set(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-
-               /* schedule periodic background inode reclaim */
-               xfs_syncd_queue_reclaim(ip->i_mount);
-
-               trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-       pag->pag_ici_reclaimable++;
-}
-
-/*
- * We set the inode flag atomically with the radix tree tag.
- * Once we get tag lookups on the radix tree, this inode flag
- * can go away.
- */
-void
-xfs_inode_set_reclaim_tag(
-       xfs_inode_t     *ip)
-{
-       struct xfs_mount *mp = ip->i_mount;
-       struct xfs_perag *pag;
-
-       pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
-       spin_lock(&pag->pag_ici_lock);
-       spin_lock(&ip->i_flags_lock);
-       __xfs_inode_set_reclaim_tag(pag, ip);
-       __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
-       spin_unlock(&ip->i_flags_lock);
-       spin_unlock(&pag->pag_ici_lock);
-       xfs_perag_put(pag);
-}
-
-STATIC void
-__xfs_inode_clear_reclaim(
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       pag->pag_ici_reclaimable--;
-       if (!pag->pag_ici_reclaimable) {
-               /* clear the reclaim tag from the perag radix tree */
-               spin_lock(&ip->i_mount->m_perag_lock);
-               radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
-                               XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-                               XFS_ICI_RECLAIM_TAG);
-               spin_unlock(&ip->i_mount->m_perag_lock);
-               trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
-                                                       -1, _RET_IP_);
-       }
-}
-
-void
-__xfs_inode_clear_reclaim_tag(
-       xfs_mount_t     *mp,
-       xfs_perag_t     *pag,
-       xfs_inode_t     *ip)
-{
-       radix_tree_tag_clear(&pag->pag_ici_root,
-                       XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
-       __xfs_inode_clear_reclaim(pag, ip);
-}
-
-/*
- * Grab the inode for reclaim exclusively.
- * Return 0 if we grabbed it, non-zero otherwise.
- */
-STATIC int
-xfs_reclaim_inode_grab(
-       struct xfs_inode        *ip,
-       int                     flags)
-{
-       ASSERT(rcu_read_lock_held());
-
-       /* quick check for stale RCU freed inode */
-       if (!ip->i_ino)
-               return 1;
-
-       /*
-        * If we are asked for non-blocking operation, do unlocked checks to
-        * see if the inode already is being flushed or in reclaim to avoid
-        * lock traffic.
-        */
-       if ((flags & SYNC_TRYLOCK) &&
-           __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
-               return 1;
-
-       /*
-        * The radix tree lock here protects a thread in xfs_iget from racing
-        * with us starting reclaim on the inode.  Once we have the
-        * XFS_IRECLAIM flag set it will not touch us.
-        *
-        * Due to RCU lookup, we may find inodes that have been freed and only
-        * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
-        * aren't candidates for reclaim at all, so we must check the
-        * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
-        */
-       spin_lock(&ip->i_flags_lock);
-       if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
-           __xfs_iflags_test(ip, XFS_IRECLAIM)) {
-               /* not a reclaim candidate. */
-               spin_unlock(&ip->i_flags_lock);
-               return 1;
-       }
-       __xfs_iflags_set(ip, XFS_IRECLAIM);
-       spin_unlock(&ip->i_flags_lock);
-       return 0;
-}
-
-/*
- * Inodes in different states need to be treated differently. The following
- * table lists the inode states and the reclaim actions necessary:
- *
- *     inode state          iflush ret         required action
- *      ---------------      ----------         ---------------
- *     bad                     -               reclaim
- *     shutdown                EIO             unpin and reclaim
- *     clean, unpinned         0               reclaim
- *     stale, unpinned         0               reclaim
- *     clean, pinned(*)        0               requeue
- *     stale, pinned           EAGAIN          requeue
- *     dirty, async            -               requeue
- *     dirty, sync             0               reclaim
- *
- * (*) dgc: I don't think the clean, pinned state is possible but it gets
- * handled anyway given the order of checks implemented.
- *
- * Also, because we get the flush lock first, we know that any inode that has
- * been flushed delwri has had the flush completed by the time we check that
- * the inode is clean.
- *
- * Note that because the inode is flushed delayed write by AIL pushing, the
- * flush lock may already be held here and waiting on it can result in very
- * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
- * the caller should push the AIL first before trying to reclaim inodes to
- * minimise the amount of time spent waiting.  For background reclaim, we only
- * bother to reclaim clean inodes anyway.
- *
- * Hence the order of actions after gaining the locks should be:
- *     bad             => reclaim
- *     shutdown        => unpin and reclaim
- *     pinned, async   => requeue
- *     pinned, sync    => unpin
- *     stale           => reclaim
- *     clean           => reclaim
- *     dirty, async    => requeue
- *     dirty, sync     => flush, wait and reclaim
- */
-STATIC int
-xfs_reclaim_inode(
-       struct xfs_inode        *ip,
-       struct xfs_perag        *pag,
-       int                     sync_mode)
-{
-       struct xfs_buf          *bp = NULL;
-       int                     error;
-
-restart:
-       error = 0;
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       if (!xfs_iflock_nowait(ip)) {
-               if (!(sync_mode & SYNC_WAIT))
-                       goto out;
-               xfs_iflock(ip);
-       }
-
-       if (is_bad_inode(VFS_I(ip)))
-               goto reclaim;
-       if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-               xfs_iunpin_wait(ip);
-               xfs_iflush_abort(ip, false);
-               goto reclaim;
-       }
-       if (xfs_ipincount(ip)) {
-               if (!(sync_mode & SYNC_WAIT))
-                       goto out_ifunlock;
-               xfs_iunpin_wait(ip);
-       }
-       if (xfs_iflags_test(ip, XFS_ISTALE))
-               goto reclaim;
-       if (xfs_inode_clean(ip))
-               goto reclaim;
-
-       /*
-        * Never flush out dirty data during non-blocking reclaim, as it would
-        * just contend with AIL pushing trying to do the same job.
-        */
-       if (!(sync_mode & SYNC_WAIT))
-               goto out_ifunlock;
-
-       /*
-        * Now we have an inode that needs flushing.
-        *
-        * Note that xfs_iflush will never block on the inode buffer lock, as
-        * xfs_ifree_cluster() can lock the inode buffer before it locks the
-        * ip->i_lock, and we are doing the exact opposite here.  As a result,
-        * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
-        * result in an ABBA deadlock with xfs_ifree_cluster().
-        *
- * As xfs_ifree_cluster() must gather all inodes that are active in the
-        * cache to mark them stale, if we hit this case we don't actually want
-        * to do IO here - we want the inode marked stale so we can simply
-        * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
-        * inode, back off and try again.  Hopefully the next pass through will
-        * see the stale flag set on the inode.
-        */
-       error = xfs_iflush(ip, &bp);
-       if (error == EAGAIN) {
-               xfs_iunlock(ip, XFS_ILOCK_EXCL);
-               /* backoff longer than in xfs_ifree_cluster */
-               delay(2);
-               goto restart;
-       }
-
-       if (!error) {
-               error = xfs_bwrite(bp);
-               xfs_buf_relse(bp);
-       }
-
-       xfs_iflock(ip);
-reclaim:
-       xfs_ifunlock(ip);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       XFS_STATS_INC(xs_ig_reclaims);
-       /*
-        * Remove the inode from the per-AG radix tree.
-        *
-        * Because radix_tree_delete won't complain even if the item was never
-        * added to the tree assert that it's been there before to catch
-        * problems with the inode life time early on.
-        */
-       spin_lock(&pag->pag_ici_lock);
-       if (!radix_tree_delete(&pag->pag_ici_root,
-                               XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
-               ASSERT(0);
-       __xfs_inode_clear_reclaim(pag, ip);
-       spin_unlock(&pag->pag_ici_lock);
-
-       /*
-        * Here we do an (almost) spurious inode lock in order to coordinate
-        * with inode cache radix tree lookups.  This is because the lookup
-        * can reference the inodes in the cache without taking references.
-        *
-        * We make that OK here by ensuring that we wait until the inode is
-        * unlocked after the lookup before we go ahead and free it.
-        */
-       xfs_ilock(ip, XFS_ILOCK_EXCL);
-       xfs_qm_dqdetach(ip);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
-       xfs_inode_free(ip);
-       return error;
-
-out_ifunlock:
-       xfs_ifunlock(ip);
-out:
-       xfs_iflags_clear(ip, XFS_IRECLAIM);
-       xfs_iunlock(ip, XFS_ILOCK_EXCL);
-       /*
-        * We could return EAGAIN here to make reclaim rescan the inode tree in
-        * a short while. However, this just burns CPU time scanning the tree
-        * waiting for IO to complete and xfssyncd never goes back to the idle
-        * state. Instead, return 0 to let the next scheduled background reclaim
-        * attempt to reclaim the inode again.
-        */
-       return 0;
-}
-
-/*
- * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
- * corrupted, we still want to try to reclaim all the inodes. If we don't,
- * then a shutdown during the filesystem unmount reclaim walk will leak all the
- * unreclaimed inodes.
- */
-int
-xfs_reclaim_inodes_ag(
-       struct xfs_mount        *mp,
-       int                     flags,
-       int                     *nr_to_scan)
-{
-       struct xfs_perag        *pag;
-       int                     error = 0;
-       int                     last_error = 0;
-       xfs_agnumber_t          ag;
-       int                     trylock = flags & SYNC_TRYLOCK;
-       int                     skipped;
-
-restart:
-       ag = 0;
-       skipped = 0;
-       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-               unsigned long   first_index = 0;
-               int             done = 0;
-               int             nr_found = 0;
-
-               ag = pag->pag_agno + 1;
-
-               if (trylock) {
-                       if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
-                               skipped++;
-                               xfs_perag_put(pag);
-                               continue;
-                       }
-                       first_index = pag->pag_ici_reclaim_cursor;
-               } else
-                       mutex_lock(&pag->pag_ici_reclaim_lock);
-
-               do {
-                       struct xfs_inode *batch[XFS_LOOKUP_BATCH];
-                       int     i;
-
-                       rcu_read_lock();
-                       nr_found = radix_tree_gang_lookup_tag(
-                                       &pag->pag_ici_root,
-                                       (void **)batch, first_index,
-                                       XFS_LOOKUP_BATCH,
-                                       XFS_ICI_RECLAIM_TAG);
-                       if (!nr_found) {
-                               done = 1;
-                               rcu_read_unlock();
-                               break;
-                       }
-
-                       /*
-                        * Grab the inodes before we drop the lock. if we found
-                        * nothing, nr == 0 and the loop will be skipped.
-                        */
-                       for (i = 0; i < nr_found; i++) {
-                               struct xfs_inode *ip = batch[i];
-
-                               if (done || xfs_reclaim_inode_grab(ip, flags))
-                                       batch[i] = NULL;
-
-                               /*
-                                * Update the index for the next lookup. Catch
-                                * overflows into the next AG range which can
-                                * occur if we have inodes in the last block of
-                                * the AG and we are currently pointing to the
-                                * last inode.
-                                *
-                                * Because we may see inodes that are from the
-                                * wrong AG due to RCU freeing and
-                                * reallocation, only update the index if it
-                                * lies in this AG. It was a race that led us
-                                * to see this inode, so another lookup from
-                                * the same index will not find it again.
-                                */
-                               if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
-                                                               pag->pag_agno)
-                                       continue;
-                               first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
-                               if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
-                                       done = 1;
-                       }
-
-                       /* unlock now we've grabbed the inodes. */
-                       rcu_read_unlock();
-
-                       for (i = 0; i < nr_found; i++) {
-                               if (!batch[i])
-                                       continue;
-                               error = xfs_reclaim_inode(batch[i], pag, flags);
-                               if (error && last_error != EFSCORRUPTED)
-                                       last_error = error;
-                       }
-
-                       *nr_to_scan -= XFS_LOOKUP_BATCH;
-
-                       cond_resched();
-
-               } while (nr_found && !done && *nr_to_scan > 0);
-
-               if (trylock && !done)
-                       pag->pag_ici_reclaim_cursor = first_index;
-               else
-                       pag->pag_ici_reclaim_cursor = 0;
-               mutex_unlock(&pag->pag_ici_reclaim_lock);
-               xfs_perag_put(pag);
-       }
-
-       /*
-        * if we skipped any AG, and we still have scan count remaining, do
- * another pass this time using blocking reclaim semantics (i.e.
- * waiting on the reclaim locks and ignoring the reclaim cursors). This
- * ensures that when we get more reclaimers than AGs we block rather
-        * than spin trying to execute reclaim.
-        */
-       if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
-               trylock = 0;
-               goto restart;
-       }
-       return XFS_ERROR(last_error);
-}
-
-int
-xfs_reclaim_inodes(
-       xfs_mount_t     *mp,
-       int             mode)
-{
-       int             nr_to_scan = INT_MAX;
-
-       return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
-}
-
-/*
- * Scan a certain number of inodes for reclaim.
- *
- * When called we make sure that there is a background (fast) inode reclaim in
- * progress, while we will throttle the speed of reclaim via doing synchronous
- * reclaim of inodes. That means if we come across dirty inodes, we wait for
- * them to be cleaned, which we hope will not be very long due to the
- * background walker having already kicked the IO off on those dirty inodes.
- */
-void
-xfs_reclaim_inodes_nr(
-       struct xfs_mount        *mp,
-       int                     nr_to_scan)
-{
-       /* kick background reclaimer and push the AIL */
-       xfs_syncd_queue_reclaim(mp);
-       xfs_ail_push_all(mp->m_ail);
-
-       xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
-}
-
-/*
- * Return the number of reclaimable inodes in the filesystem for
- * the shrinker to determine how much to reclaim.
- */
-int
-xfs_reclaim_inodes_count(
-       struct xfs_mount        *mp)
-{
-       struct xfs_perag        *pag;
-       xfs_agnumber_t          ag = 0;
-       int                     reclaimable = 0;
-
-       while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
-               ag = pag->pag_agno + 1;
-               reclaimable += pag->pag_ici_reclaimable;
-               xfs_perag_put(pag);
-       }
-       return reclaimable;
-}
-
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
deleted file mode 100644 (file)
index 941202e..0000000
+++ /dev/null
@@ -1,51 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- */
-#ifndef XFS_SYNC_H
-#define XFS_SYNC_H 1
-
-struct xfs_mount;
-struct xfs_perag;
-
-#define SYNC_WAIT              0x0001  /* wait for i/o to complete */
-#define SYNC_TRYLOCK           0x0002  /* only try to lock inodes */
-
-extern struct workqueue_struct *xfs_syncd_wq;  /* sync workqueue */
-
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
-
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
-void xfs_flush_inodes(struct xfs_inode *ip);
-
-int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
-int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
-
-void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
-                               struct xfs_inode *ip);
-
-int xfs_sync_inode_grab(struct xfs_inode *ip);
-int xfs_inode_ag_iterator(struct xfs_mount *mp,
-       int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
-       int flags);
-
-#endif
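
With xfs_sync.h gone, the surviving declarations presumably move to the new
xfs_icache.h that the hunks above now include; a hedged sketch of the expected
shape, with the iterator signature inferred from the call-site changes above:

	int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
	int xfs_inode_ag_iterator(struct xfs_mount *mp,
		int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
			       int flags, void *args),
		int flags, void *args);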
index ee2d2adaa438121a1c875a5a7dda6b96f16c20c0..2801b5ce6cdb61d9ba03a1b518561a64924a7068 100644 (file)
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
                .extra1         = &xfs_params.fstrm_timer.min,
                .extra2         = &xfs_params.fstrm_timer.max,
        },
+       {
+               .procname       = "speculative_prealloc_lifetime",
+               .data           = &xfs_params.eofb_timer.val,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = &xfs_params.eofb_timer.min,
+               .extra2         = &xfs_params.eofb_timer.max,
+       },
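
The new knob should surface through procfs like the neighbouring entries; a
small userspace sketch (the /proc path is assumed from the procname, not
stated in this patch):

	#include <stdio.h>

	int main(void)
	{
		int secs;
		FILE *f = fopen("/proc/sys/fs/xfs/speculative_prealloc_lifetime", "r");

		if (f && fscanf(f, "%d", &secs) == 1)
			printf("eofblocks scan interval: %d seconds\n", secs);
		if (f)
			fclose(f);
		return 0;
	}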
        /* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
        {
index b9937d450f8e6d72e01e186c09f446c791028ef2..bd8e157c20efa254c736952967e6c894dbd706a0 100644 (file)
@@ -47,6 +47,7 @@ typedef struct xfs_param {
        xfs_sysctl_val_t rotorstep;     /* inode32 AG rotoring control knob */
        xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
        xfs_sysctl_val_t fstrm_timer;   /* Filestream dir-AG assoc'n timeout. */
+       xfs_sysctl_val_t eofb_timer;    /* Interval between eofb scan wakeups */
 } xfs_param_t;
 
 /*
index 7d36ccf57f93236c5d8228c158b3ff5ba0886672..2e137d4a85ae66bc3a9172a4d93f734ba7aabb99 100644 (file)
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
 DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
 
 DECLARE_EVENT_CLASS(xfs_perag_class,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
 DEFINE_PERAG_REF_EVENT(xfs_perag_put);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
 
 TRACE_EVENT(xfs_attr_list_node_descend,
        TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
 DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
 
+DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+
 DECLARE_EVENT_CLASS(xfs_iref_class,
        TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
        TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
 DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
 DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
 
+DECLARE_EVENT_CLASS(xfs_attr_class,
+       TP_PROTO(struct xfs_da_args *args),
+       TP_ARGS(args),
+       TP_STRUCT__entry(
+               __field(dev_t, dev)
+               __field(xfs_ino_t, ino)
+               __dynamic_array(char, name, args->namelen)
+               __field(int, namelen)
+               __field(int, valuelen)
+               __field(xfs_dahash_t, hashval)
+               __field(int, op_flags)
+       ),
+       TP_fast_assign(
+               __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+               __entry->ino = args->dp->i_ino;
+               if (args->namelen)
+                       memcpy(__get_str(name), args->name, args->namelen);
+               __entry->namelen = args->namelen;
+               __entry->valuelen = args->valuelen;
+               __entry->hashval = args->hashval;
+               __entry->op_flags = args->op_flags;
+       ),
+       TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
+                 "hashval 0x%x op_flags %s",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->ino,
+                 __entry->namelen,
+                 __entry->namelen ? __get_str(name) : NULL,
+                 __entry->namelen,
+                 __entry->valuelen,
+                 __entry->hashval,
+                 __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
 #define DEFINE_ATTR_EVENT(name) \
-DEFINE_EVENT(xfs_da_class, name, \
+DEFINE_EVENT(xfs_attr_class, name, \
        TP_PROTO(struct xfs_da_args *args), \
        TP_ARGS(args))
 DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
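
Each DEFINE_ATTR_EVENT(name) instantiates the xfs_attr_class above as a
tracepoint, so by the usual tracepoint convention a call site looks like:

	trace_xfs_attr_leaf_compact(args);	/* args: struct xfs_da_args * */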
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
 DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
 
 DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_get);
 DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_node_replace);
 DEFINE_ATTR_EVENT(xfs_attr_node_removename);
 
+DEFINE_ATTR_EVENT(xfs_attr_fillstate);
+DEFINE_ATTR_EVENT(xfs_attr_refillstate);
+
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
+
 #define DEFINE_DA_EVENT(name) \
 DEFINE_EVENT(xfs_da_class, name, \
        TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
 DEFINE_DA_EVENT(xfs_da_node_remove);
 DEFINE_DA_EVENT(xfs_da_node_rebalance);
 DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_node_toosmall);
 DEFINE_DA_EVENT(xfs_da_swap_lastblock);
 DEFINE_DA_EVENT(xfs_da_grow_inode);
 DEFINE_DA_EVENT(xfs_da_shrink_inode);
+DEFINE_DA_EVENT(xfs_da_fixhashpath);
+DEFINE_DA_EVENT(xfs_da_path_shift);
 
 DECLARE_EVENT_CLASS(xfs_dir2_space_class,
        TP_PROTO(struct xfs_da_args *args, int idx),
index db056544cbb5ecaa2360d9ac1319f171fbbe1e27..c6c0601abd7a6b17f7a40e99fe7052dcd679baf0 100644
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
        int                     numblks,
        uint                    flags)
 {
-       struct xfs_buf_map      map = {
-               .bm_bn = blkno,
-               .bm_len = numblks,
-       };
+       DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
        return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
 }
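DEFINE_SINGLE_BUF_MAP() replaces the open-coded single-entry map initializer
here and below.  It was added to fs/xfs/xfs_buf.h earlier in this series; as a
sketch (a reconstruction, check xfs_buf.h for the authoritative form) it
expands to:

    #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
        struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
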
 
@@ -476,7 +473,8 @@ int         xfs_trans_read_buf_map(struct xfs_mount *mp,
                                       struct xfs_buftarg *target,
                                       struct xfs_buf_map *map, int nmaps,
                                       xfs_buf_flags_t flags,
-                                      struct xfs_buf **bpp);
+                                      struct xfs_buf **bpp,
+                                      const struct xfs_buf_ops *ops);
 
 static inline int
 xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
        xfs_daddr_t             blkno,
        int                     numblks,
        xfs_buf_flags_t         flags,
-       struct xfs_buf          **bpp)
+       struct xfs_buf          **bpp,
+       const struct xfs_buf_ops *ops)
 {
-       struct xfs_buf_map      map = {
-               .bm_bn = blkno,
-               .bm_len = numblks,
-       };
-       return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
+       DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+       return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+                                     flags, bpp, ops);
 }
 
 struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
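With the extra ops argument, metadata readers hand their verifier down through
the transaction layer to the buffer cache.  A sketch of a typical caller after
this change, modelled on the AGF read path (xfs_agf_buf_ops is one of the
verifier ops structures introduced by this series):

    error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
                XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
                XFS_FSS_TO_BB(mp, 1), flags, &bp, &xfs_agf_buf_ops);
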
index 6311b99c267f69fe3ac1e7cc4d72a7681e96415d..4fc17d479d42301d13cc86bfd2a8aec194fecea0 100644
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
        struct xfs_buf_map      *map,
        int                     nmaps,
        xfs_buf_flags_t         flags,
-       struct xfs_buf          **bpp)
+       struct xfs_buf          **bpp,
+       const struct xfs_buf_ops *ops)
 {
        xfs_buf_t               *bp;
        xfs_buf_log_item_t      *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
 
        *bpp = NULL;
        if (!tp) {
-               bp = xfs_buf_read_map(target, map, nmaps, flags);
+               bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
                if (!bp)
                        return (flags & XBF_TRYLOCK) ?
                                        EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
                if (!(XFS_BUF_ISDONE(bp))) {
                        trace_xfs_trans_read_buf_io(bp, _RET_IP_);
                        ASSERT(!XFS_BUF_ISASYNC(bp));
+                       ASSERT(bp->b_iodone == NULL);
                        XFS_BUF_READ(bp);
+                       bp->b_ops = ops;
                        xfsbdstrat(tp->t_mountp, bp);
                        error = xfs_buf_iowait(bp);
                        if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
                return 0;
        }
 
-       bp = xfs_buf_read_map(target, map, nmaps, flags);
+       bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
        if (bp == NULL) {
                *bpp = NULL;
                return (flags & XBF_TRYLOCK) ?
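For reference, the ops structure threaded through xfs_trans_read_buf_map() is,
at this point in the series, just a pair of verifier callbacks run at read
completion and write submission.  Sketch of its shape as introduced in
fs/xfs/xfs_buf.h:

    struct xfs_buf_ops {
        void (*verify_read)(struct xfs_buf *);
        void (*verify_write)(struct xfs_buf *);
    };
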
index 2a5c637344b4f310cd28586eec082d38176cbfcb..d95f565a390e8bc006b708b734b6107620a8d0aa 100644
@@ -47,6 +47,7 @@
 #include "xfs_filestream.h"
 #include "xfs_vnodeops.h"
 #include "xfs_trace.h"
+#include "xfs_icache.h"
 
 /*
  * The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
                d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
                byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
 
-               bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+               bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
                if (!bp)
                        return XFS_ERROR(ENOMEM);
                error = bp->b_error;
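xfs_buf_read() grew the same const struct xfs_buf_ops * parameter.  Symlink
data blocks have no read verifier at this point, hence the NULL here.  The
underlying prototype now looks roughly like this (see fs/xfs/xfs_buf.h):

    struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
                struct xfs_buf_map *map, int nmaps,
                xfs_buf_flags_t flags,
                const struct xfs_buf_ops *ops);
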
@@ -150,7 +151,7 @@ xfs_readlink(
  * when the link count isn't zero and by xfs_dm_punch_hole() when
  * punching a hole to EOF.
  */
-STATIC int
+int
 xfs_free_eofblocks(
        xfs_mount_t     *mp,
        xfs_inode_t     *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
                if (need_iolock) {
                        if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
                                xfs_trans_cancel(tp, 0);
-                               return 0;
+                               return EAGAIN;
                        }
                }
 
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
                } else {
                        error = xfs_trans_commit(tp,
                                                XFS_TRANS_RELEASE_LOG_RES);
+                       if (!error)
+                               xfs_inode_clear_eofblocks_tag(ip);
                }
 
                xfs_iunlock(ip, XFS_ILOCK_EXCL);
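Clearing the eofblocks tag on a successful commit pairs with the tagging added
by the speculative-preallocation tracking patches in this pull: inodes that
may hold post-EOF blocks are tagged in a per-AG radix tree so a background
scan can find and trim them.  A sketch of the pairing, with names per
fs/xfs/xfs_icache.h (the exact set-side call site is an assumption here):

    /* buffered writes leave speculative delalloc blocks beyond EOF: */
    xfs_inode_set_eofblocks_tag(ip);

    /* ...and once the post-EOF blocks are freed, as above: */
    xfs_inode_clear_eofblocks_tag(ip);
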
@@ -425,19 +428,18 @@ xfs_release(
                truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
                if (truncated) {
                        xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
-                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
-                               xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
+                       if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+                               error = -filemap_flush(VFS_I(ip)->i_mapping);
+                               if (error)
+                                       return error;
+                       }
                }
        }
 
        if (ip->i_d.di_nlink == 0)
                return 0;
 
-       if ((S_ISREG(ip->i_d.di_mode) &&
-            (VFS_I(ip)->i_size > 0 ||
-             (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-            (ip->i_df.if_flags & XFS_IFEXTENTS))  &&
-           (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+       if (xfs_can_free_eofblocks(ip, false)) {
 
                /*
                 * If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
                        return 0;
 
                error = xfs_free_eofblocks(mp, ip, true);
-               if (error)
+               if (error && error != EAGAIN)
                        return error;
 
                /* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
                goto out;
 
        if (ip->i_d.di_nlink != 0) {
-               if ((S_ISREG(ip->i_d.di_mode) &&
-                   (VFS_I(ip)->i_size > 0 ||
-                    (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
-                   (ip->i_df.if_flags & XFS_IFEXTENTS) &&
-                   (!(ip->i_d.di_flags &
-                               (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
-                    ip->i_delayed_blks != 0))) {
+               /*
+                * force is true because we are evicting an inode from the
+                * cache. Post-EOF blocks must be freed, lest we end up with
+                * broken free space accounting.
+                */
+               if (xfs_can_free_eofblocks(ip, true)) {
                        error = xfs_free_eofblocks(mp, ip, false);
                        if (error)
                                return VN_INACTIVE_CACHE;
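Both call sites above now share the factored-out predicate.  A hedged
reconstruction of xfs_can_free_eofblocks() from the two open-coded checks it
replaces (the authoritative version lands with this series; "force" overrides
the PREALLOC/APPEND opt-out when delalloc blocks exist):

    bool
    xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
    {
        /* prealloc/delalloc only ever exists on regular files */
        if (!S_ISREG(ip->i_d.di_mode))
            return false;

        /* zero-size files with nothing cached have nothing to trim */
        if (VFS_I(ip)->i_size == 0 &&
            VN_CACHED(VFS_I(ip)) == 0 &&
            ip->i_delayed_blks == 0)
            return false;

        /* don't read in the extent list just to free blocks */
        if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
            return false;

        /* preallocated/append-only files: only when forced, and only
         * if there are delalloc blocks to clean up */
        if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
            if (!force || ip->i_delayed_blks == 0)
                return false;

        return true;
    }
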
@@ -777,7 +778,7 @@ xfs_create(
                        XFS_TRANS_PERM_LOG_RES, log_count);
        if (error == ENOSPC) {
                /* flush outstanding delalloc blocks and retry */
-               xfs_flush_inodes(dp);
+               xfs_flush_inodes(mp);
                error = xfs_trans_reserve(tp, resblks, log_res, 0,
                                XFS_TRANS_PERM_LOG_RES, log_count);
        }
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
 
        rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
        ioffset = offset & ~(rounding - 1);
-
-       if (VN_CACHED(VFS_I(ip)) != 0) {
-               error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
-               if (error)
-                       goto out_unlock_iolock;
-       }
+       error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+                                             ioffset, -1);
+       if (error)
+               goto out_unlock_iolock;
+       truncate_pagecache_range(VFS_I(ip), ioffset, -1);
 
        /*
         * Need to zero the stuff we're not freeing, on disk.
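Note the sign convention at this boundary: the generic mm helpers return 0 or
a negative errno, while XFS code of this era still uses positive errnos
internally, hence the leading minus.  A minimal sketch of the convention
(variable names illustrative):

    int error;

    /* filemap_write_and_wait_range() returns 0 or -Exxx... */
    error = -filemap_write_and_wait_range(mapping, start, -1);
    if (error)
        return XFS_ERROR(error);    /* error is positive here, e.g. EIO */
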
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
        return error;
 }
 
+
+STATIC int
+xfs_zero_file_space(
+       struct xfs_inode        *ip,
+       xfs_off_t               offset,
+       xfs_off_t               len,
+       int                     attr_flags)
+{
+       struct xfs_mount        *mp = ip->i_mount;
+       uint                    granularity;
+       xfs_off_t               start_boundary;
+       xfs_off_t               end_boundary;
+       int                     error;
+
+       granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+       /*
+        * Round the range we are going to convert inwards to granularity
+        * boundaries.  An already-aligned offset is left unchanged, so the
+        * converted range then starts exactly at the granule the offset
+        * points to.
+        */
+       start_boundary = round_up(offset, granularity);
+       end_boundary = round_down(offset + len, granularity);
+
+       ASSERT(start_boundary >= offset);
+       ASSERT(end_boundary <= offset + len);
+
+       if (!(attr_flags & XFS_ATTR_NOLOCK))
+               xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+       if (start_boundary < end_boundary - 1) {
+               /* punch out the page cache over the conversion range */
+               truncate_pagecache_range(VFS_I(ip), start_boundary,
+                                        end_boundary - 1);
+               /* convert the blocks */
+               error = xfs_alloc_file_space(ip, start_boundary,
+                                       end_boundary - start_boundary - 1,
+                                       XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+                                       attr_flags);
+               if (error)
+                       goto out_unlock;
+
+               /* We've handled the interior of the range, now for the edges */
+               if (start_boundary != offset)
+                       error = xfs_iozero(ip, offset, start_boundary - offset);
+               if (error)
+                       goto out_unlock;
+
+               if (end_boundary != offset + len)
+                       error = xfs_iozero(ip, end_boundary,
+                                          offset + len - end_boundary);
+
+       } else {
+               /*
+                * The range is either smaller than one granule or straddles
+                * a granule boundary without covering a whole granule, so
+                * just zero the entire range through the page cache.
+                */
+               error = xfs_iozero(ip, offset, len);
+       }
+
+out_unlock:
+       if (!(attr_flags & XFS_ATTR_NOLOCK))
+               xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+       return error;
+
+}
+
 /*
  * xfs_change_file_space()
  *      This routine allocates or frees disk space for the given file.
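To make the boundary arithmetic in xfs_zero_file_space() concrete, here is a
small self-contained userspace sketch of the same round_up/round_down logic
(the values and the 4k granularity are illustrative; the macros mirror the
kernel's power-of-two variants):

    #include <stdio.h>

    #define round_up(x, y)   ((((x) - 1) | ((y) - 1)) + 1)
    #define round_down(x, y) ((x) & ~((y) - 1))

    int main(void)
    {
        long long gran = 4096, offset = 5000, len = 20000;
        long long start = round_up(offset, gran);        /* 8192 */
        long long end = round_down(offset + len, gran);  /* 24576 */

        /* interior [8192,24576) is converted to unwritten extents;
         * the sub-granule edges [5000,8192) and [24576,25000) are
         * zeroed through the page cache by xfs_iozero(). */
        printf("convert [%lld,%lld), zero [%lld,%lld) and [%lld,%lld)\n",
               start, end, offset, start, end, offset + len);
        return 0;
    }
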
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
        xfs_fsize_t     fsize;
        int             setprealloc;
        xfs_off_t       startoffset;
-       xfs_off_t       llen;
        xfs_trans_t     *tp;
        struct iattr    iattr;
-       int             prealloc_type;
 
        if (!S_ISREG(ip->i_d.di_mode))
                return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
                return XFS_ERROR(EINVAL);
        }
 
-       llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+       /*
+        * A length of <= 0 for resv/unresv/zero is invalid.  The length for
+        * alloc/free is ignored completely and we have no idea what userspace
+        * might have set it to, so zero it to allow the range checks below
+        * to pass.
+        */
+       switch (cmd) {
+       case XFS_IOC_ZERO_RANGE:
+       case XFS_IOC_RESVSP:
+       case XFS_IOC_RESVSP64:
+       case XFS_IOC_UNRESVSP:
+       case XFS_IOC_UNRESVSP64:
+               if (bf->l_len <= 0)
+                       return XFS_ERROR(EINVAL);
+               break;
+       default:
+               bf->l_len = 0;
+               break;
+       }
 
        if (bf->l_start < 0 ||
            bf->l_start > mp->m_super->s_maxbytes ||
-           bf->l_start + llen < 0 ||
-           bf->l_start + llen > mp->m_super->s_maxbytes)
+           bf->l_start + bf->l_len < 0 ||
+           bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
                return XFS_ERROR(EINVAL);
 
        bf->l_whence = 0;
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
        startoffset = bf->l_start;
        fsize = XFS_ISIZE(ip);
 
-       /*
-        * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
-        * file space.
-        * These calls do NOT zero the data space allocated to the file,
-        * nor do they change the file size.
-        *
-        * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
-        * space.
-        * These calls cause the new file data to be zeroed and the file
-        * size to be changed.
-        */
        setprealloc = clrprealloc = 0;
-       prealloc_type = XFS_BMAPI_PREALLOC;
-
        switch (cmd) {
        case XFS_IOC_ZERO_RANGE:
-               prealloc_type |= XFS_BMAPI_CONVERT;
-               xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
-               /* FALLTHRU */
+               error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+                                               attr_flags);
+               if (error)
+                       return error;
+               setprealloc = 1;
+               break;
+
        case XFS_IOC_RESVSP:
        case XFS_IOC_RESVSP64:
                error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
-                                               prealloc_type, attr_flags);
+                                               XFS_BMAPI_PREALLOC, attr_flags);
                if (error)
                        return error;
                setprealloc = 1;
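With byte-range granularity in place, XFS_IOC_ZERO_RANGE can be issued from
userspace at arbitrary, unaligned offsets.  A self-contained sketch (assumes
the xfsprogs headers; error handling kept minimal):

    #include <xfs/xfs.h>        /* XFS_IOC_ZERO_RANGE, struct xfs_flock64 */
    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        struct xfs_flock64 fl = {
            .l_whence = SEEK_SET,   /* l_start is an absolute offset */
            .l_start  = 4196,       /* deliberately unaligned */
            .l_len    = 10000,      /* must now be > 0 for ZERO_RANGE */
        };
        int fd = open(argv[1], O_RDWR);

        if (fd < 0 || ioctl(fd, XFS_IOC_ZERO_RANGE, &fl) < 0) {
            perror("XFS_IOC_ZERO_RANGE");
            return 1;
        }
        close(fd);
        return 0;
    }
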
index 447e146b2ba6d8ae1a9213841f6daae5a209f487..5163022d98089b9b6eb2fb4bdbd9df8f0861b92b 100644
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
 int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
 int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
                int flags, struct attrlist_cursor_kern *cursor);
-void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
-               xfs_off_t last, int fiopt);
-int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
-               xfs_off_t last, int fiopt);
-int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
-               xfs_off_t last, uint64_t flags, int fiopt);
-int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
 
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
 int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
 
 #endif /* _XFS_VNODEOPS_H */