Merge tag 'xfs-for-linus-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/dgc...
[firefly-linux-kernel-4.4.55.git] / fs / xfs / xfs_file.c
index f80e90f95ad8d766e34890326bf33f77a03ba125..f5392ab2def1ab806aa075bcc19643cbac6ca8f9 100644
@@ -242,19 +242,30 @@ xfs_file_fsync(
        }
 
        /*
-        * All metadata updates are logged, which means that we just have
-        * to flush the log up to the latest LSN that touched the inode.
+        * All metadata updates are logged, which means that we just have to
+        * flush the log up to the latest LSN that touched the inode. If we have
+        * concurrent fsync/fdatasync() calls, we need them to all block on the
+        * log force before we clear the ili_fsync_fields field. This ensures
+        * that we don't get a racing sync operation that does not wait for the
+        * metadata to hit the journal before returning. If we race with
+        * clearing the ili_fsync_fields, then all that will happen is the log
+        * force will do nothing as the lsn will already be on disk. We can't
+        * race with setting ili_fsync_fields because that is done under
+        * XFS_ILOCK_EXCL, which cannot be taken while we hold the lock
+        * shared, and we keep it shared until ili_fsync_fields is cleared.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        if (xfs_ipincount(ip)) {
                if (!datasync ||
-                   (ip->i_itemp->ili_fields & ~XFS_ILOG_TIMESTAMP))
+                   (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
                        lsn = ip->i_itemp->ili_last_lsn;
        }
-       xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
-       if (lsn)
+       if (lsn) {
                error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
+               ip->i_itemp->ili_fsync_fields = 0;
+       }
+       xfs_iunlock(ip, XFS_ILOCK_SHARED);
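
Taken together, the reworked fsync path reads roughly as follows (a condensed, annotated sketch of the hunk above, not a line-for-line copy; error handling elided):

	xfs_ilock(ip, XFS_ILOCK_SHARED);	/* transactions set ili_fsync_fields under
						 * ILOCK_EXCL, so they are excluded here */
	if (xfs_ipincount(ip)) {		/* inode has changes not yet on stable storage */
		if (!datasync ||
		    (ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
			lsn = ip->i_itemp->ili_last_lsn;
	}

	if (lsn) {
		/* every racing fsync/fdatasync blocks on this force before the clear */
		error = _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
		/* safe to clear: a racer that already read the lsn still forces it,
		 * and that force is a no-op once the lsn is on disk */
		ip->i_itemp->ili_fsync_fields = 0;
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);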
 
        /*
         * If we only have a single device, and the log force above was
@@ -287,7 +298,7 @@ xfs_file_read_iter(
        xfs_fsize_t             n;
        loff_t                  pos = iocb->ki_pos;
 
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(mp, xs_read_calls);
 
        if (unlikely(iocb->ki_flags & IOCB_DIRECT))
                ioflags |= XFS_IO_ISDIRECT;
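
The extra first argument to XFS_STATS_INC()/XFS_STATS_ADD() throughout this series reflects the switch from a single global counter set to per-mount statistics: each call site now names the struct xfs_mount it is charging. As a rough idea of the shape such a macro can take (purely illustrative; the identifiers below are hypothetical and are not the real definitions in fs/xfs/xfs_stats.h), assuming per-cpu counters hung off both a global stats object and the mount:

/* hypothetical sketch -- not the real XFS_STATS_INC */
#define EXAMPLE_STATS_INC(mp, field)				\
do {								\
	this_cpu_inc(example_global_stats->field);		\
	this_cpu_inc((mp)->m_example_stats->field);		\
} while (0)

Call sites that only have the inode to hand pass ip->i_mount, as in the splice and write paths below.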
@@ -365,7 +376,7 @@ xfs_file_read_iter(
 
        ret = generic_file_read_iter(iocb, to);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(mp, xs_read_bytes, ret);
 
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
@@ -383,7 +394,7 @@ xfs_file_splice_read(
        int                     ioflags = 0;
        ssize_t                 ret;
 
-       XFS_STATS_INC(xs_read_calls);
+       XFS_STATS_INC(ip->i_mount, xs_read_calls);
 
        if (infilp->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
@@ -401,7 +412,7 @@ xfs_file_splice_read(
        else
                ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
-               XFS_STATS_ADD(xs_read_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 
        xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
        return ret;
@@ -482,6 +493,8 @@ xfs_zero_eof(
        ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
        ASSERT(offset > isize);
 
+       trace_xfs_zero_eof(ip, isize, offset - isize);
+
        /*
         * First handle zeroing the block on which isize resides.
         *
@@ -574,6 +587,7 @@ xfs_file_aio_write_checks(
        struct xfs_inode        *ip = XFS_I(inode);
        ssize_t                 error = 0;
        size_t                  count = iov_iter_count(from);
+       bool                    drained_dio = false;
 
 restart:
        error = generic_write_checks(iocb, from);
@@ -611,12 +625,13 @@ restart:
                bool    zero = false;
 
                spin_unlock(&ip->i_flags_lock);
-               if (*iolock == XFS_IOLOCK_SHARED) {
-                       xfs_rw_iunlock(ip, *iolock);
-                       *iolock = XFS_IOLOCK_EXCL;
-                       xfs_rw_ilock(ip, *iolock);
-                       iov_iter_reexpand(from, count);
-
+               if (!drained_dio) {
+                       if (*iolock == XFS_IOLOCK_SHARED) {
+                               xfs_rw_iunlock(ip, *iolock);
+                               *iolock = XFS_IOLOCK_EXCL;
+                               xfs_rw_ilock(ip, *iolock);
+                               iov_iter_reexpand(from, count);
+                       }
                        /*
                         * We now have an IO submission barrier in place, but
                         * AIO can do EOF updates during IO completion and hence
@@ -626,6 +641,7 @@ restart:
                         * no-op.
                         */
                        inode_dio_wait(inode);
+                       drained_dio = true;
                        goto restart;
                }
                error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
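
The new drained_dio flag guarantees the drain-and-retry dance happens at most once: even a caller that already holds XFS_IOLOCK_EXCL must wait for in-flight AIO (whose completions can move EOF) before zeroing, but the restart loop must not keep draining forever. Stripped to its control flow, the path now looks like this (a sketch of the hunk above; the i_flags_lock handling around the EOF check and the other write checks are elided):

restart:
	/* ... generic_write_checks() and friends ... */
	if (iocb->ki_pos > i_size_read(inode)) {
		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				/* upgrade the lock so no new direct IO can be submitted */
				xfs_rw_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_rw_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			inode_dio_wait(inode);	/* let racing AIO completions update EOF */
			drained_dio = true;	/* never drain (or restart) a second time */
			goto restart;		/* re-check against the now-stable i_size */
		}
		error = xfs_zero_eof(ip, iocb->ki_pos, i_size_read(inode), &zero);
	}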
@@ -867,7 +883,7 @@ xfs_file_write_iter(
        ssize_t                 ret;
        size_t                  ocount = iov_iter_count(from);
 
-       XFS_STATS_INC(xs_write_calls);
+       XFS_STATS_INC(ip->i_mount, xs_write_calls);
 
        if (ocount == 0)
                return 0;
@@ -883,7 +899,7 @@ xfs_file_write_iter(
        if (ret > 0) {
                ssize_t err;
 
-               XFS_STATS_ADD(xs_write_bytes, ret);
+               XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
 
                /* Handle various SYNC-type writes */
                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
@@ -1477,7 +1493,7 @@ xfs_file_llseek(
  *
  * mmap_sem (MM)
  *   sb_start_pagefault(vfs, freeze)
- *     i_mmap_lock (XFS - truncate serialisation)
+ *     i_mmaplock (XFS - truncate serialisation)
  *       page_lock (MM)
  *         i_lock (XFS - extent map serialisation)
  */
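
Read top-down, and with mmap_sem already held by the fault path, that ordering maps onto the handlers in this file roughly as follows (a sketch of the write-fault case; the page_lock and i_lock levels are taken inside block_page_mkwrite() and xfs_get_blocks() rather than in this file):

	/* ->page_mkwrite, simplified */
	sb_start_pagefault(inode->i_sb);		/* freeze protection (vfs)       */
	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);	/* i_mmaplock: vs. truncate      */
	ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
							/* locks the page, then          */
							/* xfs_get_blocks() takes i_lock */
							/* for the extent map            */
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	sb_end_pagefault(inode->i_sb);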
@@ -1503,8 +1519,7 @@ xfs_filemap_page_mkwrite(
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
        if (IS_DAX(inode)) {
-               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else {
                ret = block_page_mkwrite(vma, vmf, xfs_get_blocks);
                ret = block_page_mkwrite_return(ret);
@@ -1538,7 +1553,7 @@ xfs_filemap_fault(
                 * changes to xfs_get_blocks_direct() to map unwritten extent
                 * ioend for conversion on read-only mappings.
                 */
-               ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL);
+               ret = __dax_fault(vma, vmf, xfs_get_blocks_dax_fault, NULL);
        } else
                ret = filemap_fault(vma, vmf);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
@@ -1546,6 +1561,13 @@ xfs_filemap_fault(
        return ret;
 }
 
+/*
+ * Similar to xfs_filemap_fault(), the DAX fault path can call into here on
+ * both read and write faults. Hence we need to handle both cases. There is no
+ * ->pmd_mkwrite callout for huge pages, so we have a single function here to
+ * handle both cases. @flags carries the information on the type of fault
+ * occurring.
+ */
 STATIC int
 xfs_filemap_pmd_fault(
        struct vm_area_struct   *vma,
@@ -1562,15 +1584,54 @@ xfs_filemap_pmd_fault(
 
        trace_xfs_filemap_pmd_fault(ip);
 
-       sb_start_pagefault(inode->i_sb);
-       file_update_time(vma->vm_file);
+       if (flags & FAULT_FLAG_WRITE) {
+               sb_start_pagefault(inode->i_sb);
+               file_update_time(vma->vm_file);
+       }
+
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_direct,
-                                   xfs_end_io_dax_write);
+       ret = __dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault,
+                             NULL);
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
-       sb_end_pagefault(inode->i_sb);
 
+       if (flags & FAULT_FLAG_WRITE)
+               sb_end_pagefault(inode->i_sb);
+
+       return ret;
+}
+
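Because there is no ->pmd_mkwrite, FAULT_FLAG_WRITE is the only thing distinguishing the two cases, and only write faults need freeze protection and a timestamp update. For reference, a minimal userspace sequence that provokes each case (the mount point, file name and mapping length are hypothetical; the file is assumed to be at least as large as the mapping):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 2 * 1024 * 1024;
	int fd = open("/mnt/dax/testfile", O_RDWR);	/* hypothetical DAX-mounted file */
	char *p;

	if (fd < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;

	/* first touch is a read: ->fault/->pmd_fault run without FAULT_FLAG_WRITE */
	volatile char c = p[0];
	(void)c;

	/* writing the already-mapped page is a write-protect fault, which is where
	 * ->page_mkwrite/->pfn_mkwrite (or a write ->pmd_fault) come in */
	p[0] = 1;

	munmap(p, len);
	close(fd);
	return 0;
}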
+/*
+ * pfn_mkwrite was originally intended to ensure we capture time stamp
+ * updates on write faults. In reality, it's needed to serialise against
+ * truncate, similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
+ * here and cycle the XFS_MMAPLOCK_SHARED so that the fault serialises
+ * against any truncate that is in progress.
+ */
+static int
+xfs_filemap_pfn_mkwrite(
+       struct vm_area_struct   *vma,
+       struct vm_fault         *vmf)
+{
+
+       struct inode            *inode = file_inode(vma->vm_file);
+       struct xfs_inode        *ip = XFS_I(inode);
+       int                     ret = VM_FAULT_NOPAGE;
+       loff_t                  size;
+
+       trace_xfs_filemap_pfn_mkwrite(ip);
+
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+
+       /* check if the faulting page hasn't raced with truncate */
+       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
+       size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       if (vmf->pgoff >= size)
+               ret = VM_FAULT_SIGBUS;
+       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
        return ret;
+
 }
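
The truncate check rounds i_size up to whole pages and compares the result against the faulting page index: with 4096-byte pages, an i_size of 8193 bytes covers three pages, so pgoff 0-2 are still valid and pgoff 3 or beyond means the page was truncated away and the fault gets VM_FAULT_SIGBUS. The same arithmetic as a standalone sketch (page size hard-coded purely for illustration):

#include <stdio.h>

#define EX_PAGE_SIZE	4096ULL
#define EX_PAGE_SHIFT	12

/* same round-up as xfs_filemap_pfn_mkwrite(): pages needed to cover i_size */
static unsigned long pages_covering(unsigned long long i_size)
{
	return (i_size + EX_PAGE_SIZE - 1) >> EX_PAGE_SHIFT;
}

int main(void)
{
	unsigned long long i_size = 8193;	/* just over two pages */
	unsigned long size = pages_covering(i_size);	/* == 3 */

	for (unsigned long pgoff = 0; pgoff < 4; pgoff++)
		printf("pgoff %lu -> %s\n", pgoff,
		       pgoff >= size ? "SIGBUS (beyond EOF)" : "ok");
	return 0;
}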
 
 static const struct vm_operations_struct xfs_file_vm_ops = {
@@ -1578,6 +1639,7 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
        .pmd_fault      = xfs_filemap_pmd_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = xfs_filemap_page_mkwrite,
+       .pfn_mkwrite    = xfs_filemap_pfn_mkwrite,
 };
 
 STATIC int