Merge branch 'for-3.5-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj...
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 539f36cf3e4a3b574f6bb977bcbdf387571675b3..8d2fb8c88cf36a196c47f473bcc729510ad89d8e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -231,11 +231,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 
 static void inode_sync_complete(struct inode *inode)
 {
-       /*
-        * Prevent speculative execution through
-        * spin_unlock(&wb->list_lock);
-        */
-
+       inode->i_state &= ~I_SYNC;
+       /* Waiters must see I_SYNC cleared before being woken up */
        smp_mb();
        wake_up_bit(&inode->i_state, __I_SYNC);
 }
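The rewritten inode_sync_complete() pairs the clear of I_SYNC with a full barrier before wake_up_bit(), replacing the stale comment about "speculative execution": a woken waiter must observe the bit already clear, or it will retest it and go back to sleep forever. Below is a minimal userspace model of this clear-then-wake ordering; every name in it is invented, and the yield loop stands in for the kernel's bit waitqueue.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

#define MODEL_I_SYNC 1u                 /* illustrative, not the kernel flag */

static atomic_uint i_state = MODEL_I_SYNC;

static void *waiter(void *arg)
{
        /* Like wait_on_bit(): a real waitqueue would sleep; here we
         * yield until the busy bit is observed clear. The acquire load
         * pairs with the release below, mirroring the ordering that
         * smp_mb() provides in the kernel path. */
        while (atomic_load_explicit(&i_state, memory_order_acquire) & MODEL_I_SYNC)
                sched_yield();
        puts("waiter saw the sync bit clear");
        return NULL;
}

static void model_sync_complete(void)
{
        /* Clear the bit *before* any wakeup, with ordering in between,
         * mirroring: i_state &= ~I_SYNC; smp_mb(); wake_up_bit(...); */
        atomic_fetch_and_explicit(&i_state, ~MODEL_I_SYNC, memory_order_release);
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, waiter, NULL);
        model_sync_complete();
        pthread_join(t, NULL);
        return 0;
}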
@@ -329,10 +326,12 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
 }
 
 /*
- * Wait for writeback on an inode to complete.
+ * Wait for writeback on an inode to complete. Called with i_lock held.
+ * The caller must make sure the inode cannot go away when we drop i_lock.
  */
-static void inode_wait_for_writeback(struct inode *inode,
-                                    struct bdi_writeback *wb)
+static void __inode_wait_for_writeback(struct inode *inode)
+       __releases(inode->i_lock)
+       __acquires(inode->i_lock)
 {
        DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
        wait_queue_head_t *wqh;
@@ -340,70 +339,119 @@ static void inode_wait_for_writeback(struct inode *inode,
        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
        while (inode->i_state & I_SYNC) {
                spin_unlock(&inode->i_lock);
-               spin_unlock(&wb->list_lock);
                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-               spin_lock(&wb->list_lock);
                spin_lock(&inode->i_lock);
        }
 }
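With wb->list_lock out of the picture, the wait loop above only juggles i_lock: drop it, sleep on the I_SYNC bit waitqueue, retake it, and retest. Here is a hedged pthread sketch of that drop/sleep/retake/retest shape; all names are invented, wq_lock/wq_cond stand in for bit_waitqueue(), and an atomic flag stands in for i_state.

#include <pthread.h>
#include <stdatomic.h>

#define MODEL_I_SYNC 1u

static pthread_mutex_t i_lock  = PTHREAD_MUTEX_INITIALIZER; /* plays i_lock */
static pthread_mutex_t wq_lock = PTHREAD_MUTEX_INITIALIZER; /* waitqueue    */
static pthread_cond_t  wq_cond = PTHREAD_COND_INITIALIZER;
static atomic_uint     i_state = MODEL_I_SYNC;

/* Called with i_lock held; drops and retakes it around each sleep,
 * retesting the bit under i_lock, just like the kernel loop. */
static void model_wait_for_writeback(void)
{
        while (atomic_load(&i_state) & MODEL_I_SYNC) {
                pthread_mutex_unlock(&i_lock);  /* spin_unlock(&inode->i_lock) */
                pthread_mutex_lock(&wq_lock);
                while (atomic_load(&i_state) & MODEL_I_SYNC)
                        pthread_cond_wait(&wq_cond, &wq_lock); /* __wait_on_bit() */
                pthread_mutex_unlock(&wq_lock);
                pthread_mutex_lock(&i_lock);    /* spin_lock(&inode->i_lock) */
        }
}

/* The completion side clears the bit first, then wakes. */
static void model_sync_complete(void)
{
        atomic_fetch_and(&i_state, ~MODEL_I_SYNC);
        pthread_mutex_lock(&wq_lock);
        pthread_cond_broadcast(&wq_cond);       /* wake_up_bit() */
        pthread_mutex_unlock(&wq_lock);
}

The outer retest under i_lock matters: another writer may set the bit again between the wakeup and the relock.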
 
 /*
- * Write out an inode's dirty pages.  Called under wb->list_lock and
- * inode->i_lock.  Either the caller has an active reference on the inode or
- * the inode has I_WILL_FREE set.
- *
- * If `wait' is set, wait on the writeout.
- *
- * The whole writeout design is quite complex and fragile.  We want to avoid
- * starvation of particular inodes when others are being redirtied, prevent
- * livelocks, etc.
+ * Wait for writeback on an inode to complete. Caller must have inode pinned.
  */
-static int
-writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
-                      struct writeback_control *wbc)
+void inode_wait_for_writeback(struct inode *inode)
 {
-       struct address_space *mapping = inode->i_mapping;
-       long nr_to_write = wbc->nr_to_write;
-       unsigned dirty;
-       int ret;
+       spin_lock(&inode->i_lock);
+       __inode_wait_for_writeback(inode);
+       spin_unlock(&inode->i_lock);
+}
 
-       assert_spin_locked(&wb->list_lock);
-       assert_spin_locked(&inode->i_lock);
+/*
+ * Sleep until I_SYNC is cleared. This function must be called with i_lock
+ * held, and it drops i_lock. It is intended for callers that do not hold any
+ * inode reference, so once i_lock is dropped, the inode can go away.
+ */
+static void inode_sleep_on_writeback(struct inode *inode)
+       __releases(inode->i_lock)
+{
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
+       int sleep;
 
-       if (!atomic_read(&inode->i_count))
-               WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
-       else
-               WARN_ON(inode->i_state & I_WILL_FREE);
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       sleep = inode->i_state & I_SYNC;
+       spin_unlock(&inode->i_lock);
+       if (sleep)
+               schedule();
+       finish_wait(wqh, &wait);
+}
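inode_sleep_on_writeback() is for callers that hold no reference: it samples I_SYNC under i_lock, drops the lock, and only then sleeps, never touching the inode again. There is no lost-wakeup window because prepare_to_wait() puts the task on the waitqueue before the test. A rough userspace analogue using an event-generation counter follows; the names are illustrative only, and sched_yield() stands in for schedule().

#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint i_sync   = 1;  /* bit sampled under i_lock in the kernel  */
static atomic_uint wake_gen = 0;  /* generation counter: plays the waitqueue */

/* Register, then test, then sleep: a wakeup racing with the test just
 * bumps the generation, so the sleep below returns immediately. */
static void model_sleep_on_writeback(void)
{
        unsigned gen = atomic_load(&wake_gen);      /* prepare_to_wait()   */
        bool sleep = atomic_load(&i_sync) != 0;     /* test under "i_lock" */
        /* spin_unlock(&inode->i_lock) happens here in the kernel */
        if (sleep)
                while (atomic_load(&wake_gen) == gen)   /* schedule()      */
                        sched_yield();
        /* finish_wait() would deregister from the waitqueue here. */
}

static void model_writeback_done(void)
{
        atomic_store(&i_sync, 0);
        atomic_fetch_add(&wake_gen, 1);             /* wake_up_bit()       */
}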
 
-       if (inode->i_state & I_SYNC) {
+/*
+ * Find the proper writeback list for the inode depending on its current state
+ * and on any change of state that happened while we were writing it back.
+ * Here we handle things such as livelock prevention and fairness of writeback
+ * among inodes. This function can only be called by the flusher thread - no
+ * one else processes all inodes on the writeback lists, and requeueing inodes
+ * behind the flusher thread's back can have unexpected consequences.
+ */
+static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
+                         struct writeback_control *wbc)
+{
+       if (inode->i_state & I_FREEING)
+               return;
+
+       /*
+        * Sync livelock prevention. Each inode is tagged and synced in one
+        * shot. If still dirty, it will be redirty_tail()'ed below.  Update
+        * the dirty time to prevent it from being enqueued and synced again.
+        */
+       if ((inode->i_state & I_DIRTY) &&
+           (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+               inode->dirtied_when = jiffies;
+
+       if (wbc->pages_skipped) {
                /*
-                * If this inode is locked for writeback and we are not doing
-                * writeback-for-data-integrity, move it to b_more_io so that
-                * writeback can proceed with the other inodes on s_io.
-                *
-                * We'll have another go at writing back this inode when we
-                * completed a full scan of b_io.
+                * writeback is not making progress due to locked
+                * buffers. Skip this inode for now.
                 */
-               if (wbc->sync_mode != WB_SYNC_ALL) {
+               redirty_tail(inode, wb);
+               return;
+       }
+
+       if (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+               /*
+                * We didn't write back all the pages.  nfs_writepages()
+                * sometimes bails out without doing anything.
+                */
+               if (wbc->nr_to_write <= 0) {
+                       /* Slice used up. Queue for next turn. */
                        requeue_io(inode, wb);
-                       trace_writeback_single_inode_requeue(inode, wbc,
-                                                            nr_to_write);
-                       return 0;
+               } else {
+                       /*
+                        * Writeback blocked by something other than
+                        * congestion. Delay the inode for some time to
+                        * avoid spinning on the CPU (100% iowait)
+                        * retrying writeback of the dirty page/inode
+                        * that cannot be performed immediately.
+                        */
+                       redirty_tail(inode, wb);
                }
-
+       } else if (inode->i_state & I_DIRTY) {
                /*
-                * It's a data-integrity sync.  We must wait.
+                * Filesystems can dirty the inode during writeback operations,
+                * such as delayed allocation during submission or metadata
+                * updates after data IO completion.
                 */
-               inode_wait_for_writeback(inode, wb);
+               redirty_tail(inode, wb);
+       } else {
+               /* The inode is clean. Remove from writeback lists. */
+               list_del_init(&inode->i_wb_list);
        }
+}
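requeue_inode() centralizes the disposition policy that previously lived at the tail of writeback_single_inode(). Read as a decision table it is small; the following standalone sketch renders that table with invented names (the real function manipulates the b_io/b_more_io/b_dirty lists directly, so this is only a model of the policy, not of the mechanism).

#include <stdbool.h>

enum disposition { SKIP_FREEING, REDIRTY_TAIL, REQUEUE_IO, REMOVE_FROM_LISTS };

struct wb_state {
        bool freeing;           /* I_FREEING set                      */
        bool pages_skipped;     /* wbc->pages_skipped != 0            */
        bool has_dirty_pages;   /* mapping tagged PAGECACHE_TAG_DIRTY */
        bool slice_used_up;     /* wbc->nr_to_write <= 0              */
        bool inode_dirty;       /* I_DIRTY set                        */
};

static enum disposition requeue_policy(const struct wb_state *s)
{
        if (s->freeing)
                return SKIP_FREEING;            /* leave it alone            */
        if (s->pages_skipped)
                return REDIRTY_TAIL;            /* locked buffers: back off  */
        if (s->has_dirty_pages)                 /* didn't write everything   */
                return s->slice_used_up ? REQUEUE_IO   /* fair: next turn    */
                                        : REDIRTY_TAIL; /* blocked: delay    */
        if (s->inode_dirty)
                return REDIRTY_TAIL;            /* redirtied during IO       */
        return REMOVE_FROM_LISTS;               /* clean: drop from lists    */
}

Note the precedence: I_FREEING short-circuits everything, pages_skipped beats the remaining cases, and the clean case comes last.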
 
-       BUG_ON(inode->i_state & I_SYNC);
+/*
+ * Write out an inode and its dirty pages. Do not update the writeback list
+ * linkage. That is left to the caller. The caller is also responsible for
+ * setting the I_SYNC flag and calling inode_sync_complete() to clear it.
+ */
+static int
+__writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+                        struct writeback_control *wbc)
+{
+       struct address_space *mapping = inode->i_mapping;
+       long nr_to_write = wbc->nr_to_write;
+       unsigned dirty;
+       int ret;
 
-       /* Set I_SYNC, reset I_DIRTY_PAGES */
-       inode->i_state |= I_SYNC;
-       inode->i_state &= ~I_DIRTY_PAGES;
-       spin_unlock(&inode->i_lock);
-       spin_unlock(&wb->list_lock);
+       WARN_ON(!(inode->i_state & I_SYNC));
 
        ret = do_writepages(mapping, wbc);
 
@@ -424,6 +472,9 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
         * write_inode()
         */
        spin_lock(&inode->i_lock);
+       /* Clear I_DIRTY_PAGES if we've written out all dirty pages */
+       if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
+               inode->i_state &= ~I_DIRTY_PAGES;
        dirty = inode->i_state & I_DIRTY;
        inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
        spin_unlock(&inode->i_lock);
@@ -433,60 +484,67 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
                if (ret == 0)
                        ret = err;
        }
+       trace_writeback_single_inode(inode, wbc, nr_to_write);
+       return ret;
+}
+
+/*
+ * Write out an inode's dirty pages. Either the caller has an active reference
+ * on the inode or the inode has I_WILL_FREE set.
+ *
+ * This function is designed for writing back one inode at a time, e.g. on
+ * behalf of a filesystem. The flusher thread uses __writeback_single_inode()
+ * and does more elaborate writeback list handling in writeback_sb_inodes().
+ */
+static int
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+                      struct writeback_control *wbc)
+{
+       int ret = 0;
 
-       spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
-       inode->i_state &= ~I_SYNC;
-       if (!(inode->i_state & I_FREEING)) {
+       if (!atomic_read(&inode->i_count))
+               WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
+       else
+               WARN_ON(inode->i_state & I_WILL_FREE);
+
+       if (inode->i_state & I_SYNC) {
+               if (wbc->sync_mode != WB_SYNC_ALL)
+                       goto out;
                /*
-                * Sync livelock prevention. Each inode is tagged and synced in
-                * one shot. If still dirty, it will be redirty_tail()'ed below.
-                * Update the dirty time to prevent enqueue and sync it again.
+                * It's a data-integrity sync. We must wait. Since the caller
+                * holds an inode reference or the inode has I_WILL_FREE set,
+                * it cannot go away under us.
                 */
-               if ((inode->i_state & I_DIRTY) &&
-                   (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
-                       inode->dirtied_when = jiffies;
-
-               if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-                       /*
-                        * We didn't write back all the pages.  nfs_writepages()
-                        * sometimes bales out without doing anything.
-                        */
-                       inode->i_state |= I_DIRTY_PAGES;
-                       if (wbc->nr_to_write <= 0) {
-                               /*
-                                * slice used up: queue for next turn
-                                */
-                               requeue_io(inode, wb);
-                       } else {
-                               /*
-                                * Writeback blocked by something other than
-                                * congestion. Delay the inode for some time to
-                                * avoid spinning on the CPU (100% iowait)
-                                * retrying writeback of the dirty page/inode
-                                * that cannot be performed immediately.
-                                */
-                               redirty_tail(inode, wb);
-                       }
-               } else if (inode->i_state & I_DIRTY) {
-                       /*
-                        * Filesystems can dirty the inode during writeback
-                        * operations, such as delayed allocation during
-                        * submission or metadata updates after data IO
-                        * completion.
-                        */
-                       redirty_tail(inode, wb);
-               } else {
-                       /*
-                        * The inode is clean.  At this point we either have
-                        * a reference to the inode or it's on it's way out.
-                        * No need to add it back to the LRU.
-                        */
-                       list_del_init(&inode->i_wb_list);
-               }
+               __inode_wait_for_writeback(inode);
        }
+       WARN_ON(inode->i_state & I_SYNC);
+       /*
+        * Skip the inode if it is clean. We don't want to mess with the
+        * writeback lists in this function, since the flusher thread may be
+        * doing e.g. a sync in parallel, and if we move the inode it could get
+        * skipped. So here we make sure the inode stays on whatever writeback
+        * list it is on, unless we have completely cleaned it.
+        */
+       if (!(inode->i_state & I_DIRTY))
+               goto out;
+       inode->i_state |= I_SYNC;
+       spin_unlock(&inode->i_lock);
+
+       ret = __writeback_single_inode(inode, wb, wbc);
+
+       spin_lock(&wb->list_lock);
+       spin_lock(&inode->i_lock);
+       /*
+        * If the inode is clean, remove it from the writeback lists. Otherwise
+        * don't touch it. See the comment above for explanation.
+        */
+       if (!(inode->i_state & I_DIRTY))
+               list_del_init(&inode->i_wb_list);
+       spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
-       trace_writeback_single_inode(inode, wbc, nr_to_write);
+out:
+       spin_unlock(&inode->i_lock);
        return ret;
 }
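After the refactor, writeback_single_inode() has a clear shape: wait out any running writeback (data-integrity callers only), bail if clean, set I_SYNC, do the IO with no locks held, then delist-if-clean and wake. Below is a condensed, hedged model of that control flow with a single mutex standing in for both i_lock and wb->list_lock; every name is invented.

#include <pthread.h>
#include <stdbool.h>

struct mini_inode {
        pthread_mutex_t lock;   /* plays i_lock; assume caller initialized */
        pthread_cond_t  done;   /* plays the I_SYNC bit waitqueue          */
        bool syncing;           /* plays I_SYNC                            */
        bool dirty;             /* plays I_DIRTY                           */
};

/* Stand-in for __writeback_single_inode(): do the IO, then clear the
 * dirty state under the lock, roughly as the real code clears I_DIRTY. */
static void model_do_writeout(struct mini_inode *in)
{
        /* ... write pages and metadata here ... */
        pthread_mutex_lock(&in->lock);
        in->dirty = false;
        pthread_mutex_unlock(&in->lock);
}

static void model_write_one(struct mini_inode *in, bool for_integrity)
{
        pthread_mutex_lock(&in->lock);
        if (in->syncing) {
                if (!for_integrity)
                        goto out;               /* WB_SYNC_NONE: don't block */
                while (in->syncing)             /* __inode_wait_for_writeback() */
                        pthread_cond_wait(&in->done, &in->lock);
        }
        if (!in->dirty)
                goto out;                       /* clean: leave lists alone  */
        in->syncing = true;                     /* set I_SYNC                */
        pthread_mutex_unlock(&in->lock);

        model_do_writeout(in);                  /* lock-free section         */

        pthread_mutex_lock(&in->lock);
        in->syncing = false;                    /* inode_sync_complete()     */
        pthread_cond_broadcast(&in->done);
out:
        pthread_mutex_unlock(&in->lock);
}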
 
@@ -580,29 +638,57 @@ static long writeback_sb_inodes(struct super_block *sb,
                        redirty_tail(inode, wb);
                        continue;
                }
-               __iget(inode);
+               if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
+                       /*
+                        * If this inode is locked for writeback and we are not
+                        * doing writeback-for-data-integrity, move it to
+                        * b_more_io so that writeback can proceed with the
+                        * other inodes on s_io.
+                        *
+                        * We'll have another go at writing back this inode
+                        * when we have completed a full scan of b_io.
+                        */
+                       spin_unlock(&inode->i_lock);
+                       requeue_io(inode, wb);
+                       trace_writeback_sb_inodes_requeue(inode);
+                       continue;
+               }
+               spin_unlock(&wb->list_lock);
+
+               /*
+                * We already requeued the inode if it had I_SYNC set and we
+                * are doing WB_SYNC_NONE writeback. So this catches only the
+                * WB_SYNC_ALL case.
+                */
+               if (inode->i_state & I_SYNC) {
+                       /* Wait for I_SYNC. This function drops i_lock... */
+                       inode_sleep_on_writeback(inode);
+                       /* Inode may be gone, start again */
+                       continue;
+               }
+               inode->i_state |= I_SYNC;
+               spin_unlock(&inode->i_lock);
+
                write_chunk = writeback_chunk_size(wb->bdi, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;
 
-               writeback_single_inode(inode, wb, &wbc);
+               /*
+                * We use I_SYNC to pin the inode in memory. While it is set,
+                * evict_inode() will wait, so the inode cannot be freed (a
+                * sketch of this pin follows after this hunk).
+                */
+               __writeback_single_inode(inode, wb, &wbc);
 
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                wrote += write_chunk - wbc.nr_to_write;
+               spin_lock(&wb->list_lock);
+               spin_lock(&inode->i_lock);
                if (!(inode->i_state & I_DIRTY))
                        wrote++;
-               if (wbc.pages_skipped) {
-                       /*
-                        * writeback is not making progress due to locked
-                        * buffers.  Skip this inode for now.
-                        */
-                       redirty_tail(inode, wb);
-               }
+               requeue_inode(inode, wb, &wbc);
+               inode_sync_complete(inode);
                spin_unlock(&inode->i_lock);
-               spin_unlock(&wb->list_lock);
-               iput(inode);
-               cond_resched();
-               spin_lock(&wb->list_lock);
+               cond_resched_lock(&wb->list_lock);
                /*
                 * bail out to wb_writeback() often enough to check
                 * background threshold and other termination conditions.
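The subtlest change in writeback_sb_inodes() is dropping __iget()/iput(): as the comment in the hunk notes, I_SYNC itself pins the inode, because eviction waits for the bit before freeing. Here is a small model of that pin, with the evictor refusing to free while the busy bit is set; the names are illustrative, and real eviction goes through inode_wait_for_writeback().

#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct pinned {
        pthread_mutex_t lock;   /* assume initialized by the caller */
        pthread_cond_t  done;
        bool busy;              /* plays I_SYNC */
};

/* evict_inode() side: cannot free while the flusher has it busy. */
static void model_evict(struct pinned *p)
{
        pthread_mutex_lock(&p->lock);
        while (p->busy)                         /* inode_wait_for_writeback() */
                pthread_cond_wait(&p->done, &p->lock);
        pthread_mutex_unlock(&p->lock);
        free(p);                                /* now safe to free */
}

/* flusher side: set busy under the lock, drop it, do IO, clear busy. */
static void model_flusher_writeback(struct pinned *p)
{
        pthread_mutex_lock(&p->lock);
        p->busy = true;
        pthread_mutex_unlock(&p->lock);

        /* ... __writeback_single_inode() runs here; model_evict()
         * cannot complete because busy is still set ... */

        pthread_mutex_lock(&p->lock);
        p->busy = false;
        pthread_cond_broadcast(&p->done);       /* inode_sync_complete() */
        pthread_mutex_unlock(&p->lock);
}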
@@ -796,8 +882,10 @@ static long wb_writeback(struct bdi_writeback *wb,
                        trace_writeback_wait(wb->bdi, work);
                        inode = wb_inode(wb->b_more_io.prev);
                        spin_lock(&inode->i_lock);
-                       inode_wait_for_writeback(inode, wb);
-                       spin_unlock(&inode->i_lock);
+                       spin_unlock(&wb->list_lock);
+                       /* This function drops i_lock... */
+                       inode_sleep_on_writeback(inode);
+                       spin_lock(&wb->list_lock);
                }
        }
        spin_unlock(&wb->list_lock);
@@ -1331,7 +1419,6 @@ EXPORT_SYMBOL(sync_inodes_sb);
 int write_inode_now(struct inode *inode, int sync)
 {
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       int ret;
        struct writeback_control wbc = {
                .nr_to_write = LONG_MAX,
                .sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
@@ -1343,12 +1430,7 @@ int write_inode_now(struct inode *inode, int sync)
                wbc.nr_to_write = 0;
 
        might_sleep();
-       spin_lock(&wb->list_lock);
-       spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, wb, &wbc);
-       spin_unlock(&inode->i_lock);
-       spin_unlock(&wb->list_lock);
-       return ret;
+       return writeback_single_inode(inode, wb, &wbc);
 }
 EXPORT_SYMBOL(write_inode_now);
 
@@ -1365,15 +1447,7 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
-       struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-       int ret;
-
-       spin_lock(&wb->list_lock);
-       spin_lock(&inode->i_lock);
-       ret = writeback_single_inode(inode, wb, wbc);
-       spin_unlock(&inode->i_lock);
-       spin_unlock(&wb->list_lock);
-       return ret;
+       return writeback_single_inode(inode, &inode_to_bdi(inode)->wb, wbc);
 }
 EXPORT_SYMBOL(sync_inode);