Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.  This is one
  of those problems that has been talked about for at least half a
  decade; finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...

75 files changed:
Documentation/cgroups/blkio-controller.txt
Documentation/cgroups/memory.txt
block/bio.c
block/blk-cgroup.c
block/blk-cgroup.h [deleted file]
block/blk-core.c
block/blk-integrity.c
block/blk-sysfs.c
block/blk-throttle.c
block/bounce.c
block/cfq-iosched.c
block/elevator.c
block/genhd.c
drivers/block/drbd/drbd_int.h
drivers/block/drbd/drbd_main.c
drivers/block/pktcdvd.c
drivers/char/raw.c
drivers/md/bcache/request.c
drivers/md/dm.c
drivers/md/dm.h
drivers/md/md.h
drivers/md/raid1.c
drivers/md/raid10.c
drivers/mtd/devices/block2mtd.c
drivers/staging/lustre/lustre/include/linux/lustre_patchless_compat.h
fs/9p/v9fs.c
fs/9p/vfs_super.c
fs/block_dev.c
fs/buffer.c
fs/ext2/super.c
fs/ext4/extents.c
fs/ext4/mballoc.c
fs/ext4/super.c
fs/f2fs/node.c
fs/f2fs/segment.h
fs/fat/file.c
fs/fat/inode.c
fs/fs-writeback.c
fs/fuse/file.c
fs/gfs2/super.c
fs/hfs/super.c
fs/hfsplus/super.c
fs/inode.c
fs/mpage.c
fs/nfs/filelayout/filelayout.c
fs/nfs/internal.h
fs/nfs/write.c
fs/ocfs2/file.c
fs/reiserfs/super.c
fs/ufs/super.c
fs/xfs/xfs_aops.c
fs/xfs/xfs_file.c
include/linux/backing-dev-defs.h [new file with mode: 0644]
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk-cgroup.h [new file with mode: 0644]
include/linux/blkdev.h
include/linux/cgroup.h
include/linux/fs.h
include/linux/memcontrol.h
include/linux/mm.h
include/linux/pagemap.h
include/linux/writeback.h
include/trace/events/writeback.h
init/Kconfig
mm/backing-dev.c
mm/fadvise.c
mm/filemap.c
mm/madvise.c
mm/memcontrol.c
mm/page-writeback.c
mm/readahead.c
mm/rmap.c
mm/truncate.c
mm/vmscan.c

index cd556b9147868fdd4ba095a9a31f675ac0eed6d4..68b6a6a470b073436b15aa18ef8ba822e39602b4 100644 (file)
@@ -387,8 +387,81 @@ groups and put applications in that group which are not driving enough
 IO to keep disk busy. In that case set group_idle=0, and CFQ will not idle
 on individual groups and throughput should improve.
 
-What works
-==========
-- Currently only sync IO queues are support. All the buffered writes are
-  still system wide and not per group. Hence we will not see service
-  differentiation between buffered writes between groups.
+Writeback
+=========
+
+Page cache is dirtied through buffered writes and shared mmaps and
+written asynchronously to the backing filesystem by the writeback
+mechanism.  Writeback sits between the memory and IO domains and
+regulates the proportion of dirty memory by balancing dirtying and
+write IOs.
+
+On traditional cgroup hierarchies, relationships between different
+controllers cannot be established, making it impossible for writeback
+to operate while honoring cgroup resource restrictions; all writeback
+IOs are attributed to the root cgroup.
+
+If both the blkio and memory controllers are used on the v2 hierarchy
+and the filesystem supports cgroup writeback, writeback operations
+correctly follow the resource restrictions imposed by both memory and
+blkio controllers.
+
+Writeback examines both system-wide and per-cgroup dirty memory status
+and enforces the more restrictive of the two.  Also, writeback control
+parameters which are absolute values - vm.dirty_bytes and
+vm.dirty_background_bytes - are distributed across cgroups according
+to their current writeback bandwidth.
+
+There's a peculiarity stemming from the discrepancy in ownership
+granularity between the memory controller and writeback.  While the
+memory controller tracks ownership per page, writeback operates on a
+per-inode basis.  cgroup writeback bridges the gap by tracking
+ownership per inode but migrating ownership if too many foreign pages
+- pages which don't match the current inode ownership - have been
+encountered while writing back the inode.
+
+This is a conscious design choice: writeback operations are
+inherently tied to inodes, making it complicated and inefficient to
+strictly follow page ownership.  The only use case which suffers from
+this compromise is multiple cgroups concurrently dirtying disjoint
+regions of the same inode, which is unlikely and has been deemed
+unsupported.  Note that because the memory controller assigns page
+ownership on first use and doesn't update it until the page is
+released, even if cgroup writeback strictly followed page ownership,
+multiple cgroups dirtying overlapping areas wouldn't work as expected.
+In general, write-sharing an inode across multiple cgroups is not well
+supported.
+
+Filesystem support for cgroup writeback
+---------------------------------------
+
+A filesystem can make writeback IOs cgroup-aware by updating
+address_space_operations->writepage[s]() to annotate bios using the
+following two functions.
+
+* wbc_init_bio(@wbc, @bio)
+
+  Should be called for each bio carrying writeback data and associates
+  the bio with the inode's owner cgroup.  Can be called anytime
+  between bio allocation and submission.
+
+* wbc_account_io(@wbc, @page, @bytes)
+
+  Should be called for each data segment being written out.  While
+  this function doesn't care exactly when it's called during the
+  writeback session, it's easiest and most natural to call it as
+  data segments are added to a bio.
+
+With writeback bios annotated, cgroup support can be enabled per
+super_block by setting SB_I_CGROUPWB in ->s_iflags.  This allows for
+selective disabling of cgroup writeback support, which is helpful when
+certain filesystem features, e.g. journaled data mode, are
+incompatible.
+
+wbc_init_bio() binds the specified bio to its cgroup.  Depending on
+the configuration, the bio may be executed at a lower priority and if
+the writeback session is holding shared resources, e.g. a journal
+entry, may lead to priority inversion.  There is no one easy solution
+for the problem.  Filesystems can try to work around specific problem
+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
+directly.
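
As a hedged illustration of the two hooks documented above (not part of this
series), a simple filesystem's ->writepage() might annotate its bio roughly as
follows.  The myfs_* names and the myfs_block() mapping helper are
hypothetical, and error handling is omitted:

  /* hypothetical completion handler: finish writeback on the single page */
  static void myfs_end_io(struct bio *bio, int err)
  {
          end_page_writeback(bio->bi_io_vec[0].bv_page);
          bio_put(bio);
  }

  static int myfs_writepage(struct page *page, struct writeback_control *wbc)
  {
          struct inode *inode = page->mapping->host;
          struct bio *bio = bio_alloc(GFP_NOFS, 1);

          bio->bi_bdev = inode->i_sb->s_bdev;
          bio->bi_iter.bi_sector = myfs_block(inode, page->index); /* hypothetical mapping */
          bio->bi_end_io = myfs_end_io;
          bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);

          /* associate the bio with the inode's owner cgroup */
          wbc_init_bio(wbc, bio);
          /* account the data segment as it is added to the bio */
          wbc_account_io(wbc, page, PAGE_CACHE_SIZE);

          set_page_writeback(page);
          unlock_page(page);
          submit_bio(WRITE, bio);
          return 0;
  }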
index f456b4315e86d80abe8d4d0b4fbb68eeca5b4e4a..ff71e16cc7524881dfdae90f321a98e56e51a914 100644 (file)
@@ -493,6 +493,7 @@ pgpgin              - # of charging events to the memory cgroup. The charging
 pgpgout                - # of uncharging events to the memory cgroup. The uncharging
                event happens each time a page is unaccounted from the cgroup.
 swap           - # of bytes of swap usage
+dirty          - # of bytes that are waiting to get written back to the disk.
 writeback      - # of bytes of file/anon cache that are queued for syncing to
                disk.
 inactive_anon  - # of bytes of anonymous and swap cache memory on inactive
index 259197d97de1a6e5705457f58dc7727665ab1a93..2a00d349cd6883cba32d9fd477251889a1c58081 100644 (file)
@@ -1988,6 +1988,28 @@ struct bio_set *bioset_create_nobvec(unsigned int pool_size, unsigned int front_
 EXPORT_SYMBOL(bioset_create_nobvec);
 
 #ifdef CONFIG_BLK_CGROUP
+
+/**
+ * bio_associate_blkcg - associate a bio with the specified blkcg
+ * @bio: target bio
+ * @blkcg_css: css of the blkcg to associate
+ *
+ * Associate @bio with the blkcg specified by @blkcg_css.  Block layer will
+ * treat @bio as if it were issued by a task which belongs to the blkcg.
+ *
+ * This function takes an extra reference of @blkcg_css which will be put
+ * when @bio is released.  The caller must own @bio and is responsible for
+ * synchronizing calls to this function.
+ */
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css)
+{
+       if (unlikely(bio->bi_css))
+               return -EBUSY;
+       css_get(blkcg_css);
+       bio->bi_css = blkcg_css;
+       return 0;
+}
+
 /**
  * bio_associate_current - associate a bio with %current
  * @bio: target bio
@@ -2004,26 +2026,17 @@ EXPORT_SYMBOL(bioset_create_nobvec);
 int bio_associate_current(struct bio *bio)
 {
        struct io_context *ioc;
-       struct cgroup_subsys_state *css;
 
-       if (bio->bi_ioc)
+       if (bio->bi_css)
                return -EBUSY;
 
        ioc = current->io_context;
        if (!ioc)
                return -ENOENT;
 
-       /* acquire active ref on @ioc and associate */
        get_io_context_active(ioc);
        bio->bi_ioc = ioc;
-
-       /* associate blkcg if exists */
-       rcu_read_lock();
-       css = task_css(current, blkio_cgrp_id);
-       if (css && css_tryget_online(css))
-               bio->bi_css = css;
-       rcu_read_unlock();
-
+       bio->bi_css = task_get_css(current, blkio_cgrp_id);
        return 0;
 }
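
As a hedged usage sketch of the new export above (not from the series): the
blkio-controller.txt text earlier notes that filesystems may skip
wbc_init_bio() and use bio_associate_blkcg() directly to avoid priority
inversion on shared resources such as a journal.  A hypothetical helper
pinning journal commit bios to the root blkcg could look like this;
myfs_submit_commit_bio() is made up:

  static void myfs_submit_commit_bio(struct bio *bio)
  {
          /*
           * Pin shared journal IO to the root blkcg so one throttled
           * cgroup cannot stall every transaction behind it.
           * blkcg_root_css is the symbol exported by blk-cgroup.c below.
           */
          if (bio_associate_blkcg(bio, blkcg_root_css))
                  pr_warn_once("bio already carried a blkcg association\n");

          submit_bio(WRITE_FLUSH_FUA, bio);
  }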
 
index 6e43fa355e7127e8e2b10ff33eee5c0ab43ccf90..9f97da52d006281b1ab3e2911d85934216e3931a 100644 (file)
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
 #include <linux/atomic.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 #define MAX_KEY_LEN 100
@@ -33,6 +34,8 @@ static DEFINE_MUTEX(blkcg_pol_mutex);
 struct blkcg blkcg_root;
 EXPORT_SYMBOL_GPL(blkcg_root);
 
+struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+
 static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
 
 static bool blkcg_policy_enabled(struct request_queue *q,
@@ -182,6 +185,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                                    struct blkcg_gq *new_blkg)
 {
        struct blkcg_gq *blkg;
+       struct bdi_writeback_congested *wb_congested;
        int i, ret;
 
        WARN_ON_ONCE(!rcu_read_lock_held());
@@ -193,22 +197,30 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
                goto err_free_blkg;
        }
 
+       wb_congested = wb_congested_get_create(&q->backing_dev_info,
+                                              blkcg->css.id, GFP_ATOMIC);
+       if (!wb_congested) {
+               ret = -ENOMEM;
+               goto err_put_css;
+       }
+
        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
        }
        blkg = new_blkg;
+       blkg->wb_congested = wb_congested;
 
        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -EINVAL;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }
@@ -238,18 +250,15 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
        blkg->online = true;
        spin_unlock(&blkcg->lock);
 
-       if (!ret) {
-               if (blkcg == &blkcg_root) {
-                       q->root_blkg = blkg;
-                       q->root_rl.blkg = blkg;
-               }
+       if (!ret)
                return blkg;
-       }
 
        /* @blkg failed fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);
 
+err_put_congested:
+       wb_congested_put(wb_congested);
 err_put_css:
        css_put(&blkcg->css);
 err_free_blkg:
@@ -342,15 +351,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);
 
-       /*
-        * If root blkg is destroyed.  Just clear the pointer since root_rl
-        * does not take reference on root blkg.
-        */
-       if (blkcg == &blkcg_root) {
-               blkg->q->root_blkg = NULL;
-               blkg->q->root_rl.blkg = NULL;
-       }
-
        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
@@ -405,6 +405,8 @@ void __blkg_release_rcu(struct rcu_head *rcu_head)
        if (blkg->parent)
                blkg_put(blkg->parent);
 
+       wb_congested_put(blkg->wb_congested);
+
        blkg_free(blkg);
 }
 EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@ -812,6 +814,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
        }
 
        spin_unlock_irq(&blkcg->lock);
+
+       wb_blkcg_offline(blkcg);
 }
 
 static void blkcg_css_free(struct cgroup_subsys_state *css)
@@ -868,7 +872,9 @@ done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
        return &blkcg->css;
 
 free_pd_blkcg:
@@ -892,9 +898,45 @@ free_blkcg:
  */
 int blkcg_init_queue(struct request_queue *q)
 {
-       might_sleep();
+       struct blkcg_gq *new_blkg, *blkg;
+       bool preloaded;
+       int ret;
+
+       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+       if (!new_blkg)
+               return -ENOMEM;
+
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+
+       /*
+        * Make sure the root blkg exists and count the existing blkgs.  As
+        * @q is bypassing at this point, blkg_lookup_create() can't be
+        * used.  Open code insertion.
+        */
+       rcu_read_lock();
+       spin_lock_irq(q->queue_lock);
+       blkg = blkg_create(&blkcg_root, q, new_blkg);
+       spin_unlock_irq(q->queue_lock);
+       rcu_read_unlock();
+
+       if (preloaded)
+               radix_tree_preload_end();
+
+       if (IS_ERR(blkg)) {
+               kfree(new_blkg);
+               return PTR_ERR(blkg);
+       }
 
-       return blk_throtl_init(q);
+       q->root_blkg = blkg;
+       q->root_rl.blkg = blkg;
+
+       ret = blk_throtl_init(q);
+       if (ret) {
+               spin_lock_irq(q->queue_lock);
+               blkg_destroy_all(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+       return ret;
 }
 
 /**
@@ -996,50 +1038,19 @@ int blkcg_activate_policy(struct request_queue *q,
 {
        LIST_HEAD(pds);
        LIST_HEAD(cpds);
-       struct blkcg_gq *blkg, *new_blkg;
+       struct blkcg_gq *blkg;
        struct blkg_policy_data *pd, *nd;
        struct blkcg_policy_data *cpd, *cnd;
        int cnt = 0, ret;
-       bool preloaded;
 
        if (blkcg_policy_enabled(q, pol))
                return 0;
 
-       /* preallocations for root blkg */
-       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-       if (!new_blkg)
-               return -ENOMEM;
-
+       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-
-       /*
-        * Make sure the root blkg exists and count the existing blkgs.  As
-        * @q is bypassing at this point, blkg_lookup_create() can't be
-        * used.  Open code it.
-        */
        spin_lock_irq(q->queue_lock);
-
-       rcu_read_lock();
-       blkg = __blkg_lookup(&blkcg_root, q, false);
-       if (blkg)
-               blkg_free(new_blkg);
-       else
-               blkg = blkg_create(&blkcg_root, q, new_blkg);
-       rcu_read_unlock();
-
-       if (preloaded)
-               radix_tree_preload_end();
-
-       if (IS_ERR(blkg)) {
-               ret = PTR_ERR(blkg);
-               goto out_unlock;
-       }
-
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
-
        spin_unlock_irq(q->queue_lock);
 
        /*
@@ -1140,10 +1151,6 @@ void blkcg_deactivate_policy(struct request_queue *q,
 
        __clear_bit(pol->plid, q->blkcg_pols);
 
-       /* if no policy is left, no need for blkgs - shoot them down */
-       if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
-               blkg_destroy_all(q);
-
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
deleted file mode 100644 (file)
index 74296a7..0000000
+++ /dev/null
@@ -1,627 +0,0 @@
-#ifndef _BLK_CGROUP_H
-#define _BLK_CGROUP_H
-/*
- * Common Block IO controller cgroup interface
- *
- * Based on ideas and code from CFQ, CFS and BFQ:
- * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
- *
- * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
- *                   Paolo Valente <paolo.valente@unimore.it>
- *
- * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
- *                   Nauman Rafique <nauman@google.com>
- */
-
-#include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
-#include <linux/seq_file.h>
-#include <linux/radix-tree.h>
-#include <linux/blkdev.h>
-#include <linux/atomic.h>
-
-/* Max limits for throttle policy */
-#define THROTL_IOPS_MAX                UINT_MAX
-
-#ifdef CONFIG_BLK_CGROUP
-
-enum blkg_rwstat_type {
-       BLKG_RWSTAT_READ,
-       BLKG_RWSTAT_WRITE,
-       BLKG_RWSTAT_SYNC,
-       BLKG_RWSTAT_ASYNC,
-
-       BLKG_RWSTAT_NR,
-       BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
-};
-
-struct blkcg_gq;
-
-struct blkcg {
-       struct cgroup_subsys_state      css;
-       spinlock_t                      lock;
-
-       struct radix_tree_root          blkg_tree;
-       struct blkcg_gq                 *blkg_hint;
-       struct hlist_head               blkg_list;
-
-       struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
-};
-
-struct blkg_stat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt;
-};
-
-struct blkg_rwstat {
-       struct u64_stats_sync           syncp;
-       uint64_t                        cnt[BLKG_RWSTAT_NR];
-};
-
-/*
- * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
- * request_queue (q).  This is used by blkcg policies which need to track
- * information per blkcg - q pair.
- *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
- */
-struct blkg_policy_data {
-       /* the blkg and policy id this per-policy data belongs to */
-       struct blkcg_gq                 *blkg;
-       int                             plid;
-
-       /* used during policy activation */
-       struct list_head                alloc_node;
-};
-
-/*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it. blkcg core allocates
- * policy-specific per-blkcg structures lazily the first time
- * they are actually needed, so it handles them together with
- * blkgs. cpd_init() is invoked to let each policy handle
- * per-blkcg data.
- */
-struct blkcg_policy_data {
-       /* the policy id this per-policy data belongs to */
-       int                             plid;
-
-       /* used during policy activation */
-       struct list_head                alloc_node;
-};
-
-/* association between a blk cgroup and a request queue */
-struct blkcg_gq {
-       /* Pointer to the associated request_queue */
-       struct request_queue            *q;
-       struct list_head                q_node;
-       struct hlist_node               blkcg_node;
-       struct blkcg                    *blkcg;
-
-       /* all non-root blkcg_gq's are guaranteed to have access to parent */
-       struct blkcg_gq                 *parent;
-
-       /* request allocation list for this blkcg-q pair */
-       struct request_list             rl;
-
-       /* reference count */
-       atomic_t                        refcnt;
-
-       /* is this blkg online? protected by both blkcg and q locks */
-       bool                            online;
-
-       struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
-
-       struct rcu_head                 rcu_head;
-};
-
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
-
-struct blkcg_policy {
-       int                             plid;
-       /* policy specific private data size */
-       size_t                          pd_size;
-       /* policy specific per-blkcg data size */
-       size_t                          cpd_size;
-       /* cgroup files for the policy */
-       struct cftype                   *cftypes;
-
-       /* operations */
-       blkcg_pol_init_cpd_fn           *cpd_init_fn;
-       blkcg_pol_init_pd_fn            *pd_init_fn;
-       blkcg_pol_online_pd_fn          *pd_online_fn;
-       blkcg_pol_offline_pd_fn         *pd_offline_fn;
-       blkcg_pol_exit_pd_fn            *pd_exit_fn;
-       blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
-};
-
-extern struct blkcg blkcg_root;
-
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
-                                   struct request_queue *q);
-int blkcg_init_queue(struct request_queue *q);
-void blkcg_drain_queue(struct request_queue *q);
-void blkcg_exit_queue(struct request_queue *q);
-
-/* Blkio controller policy registration */
-int blkcg_policy_register(struct blkcg_policy *pol);
-void blkcg_policy_unregister(struct blkcg_policy *pol);
-int blkcg_activate_policy(struct request_queue *q,
-                         const struct blkcg_policy *pol);
-void blkcg_deactivate_policy(struct request_queue *q,
-                            const struct blkcg_policy *pol);
-
-void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
-                      u64 (*prfill)(struct seq_file *,
-                                    struct blkg_policy_data *, int),
-                      const struct blkcg_policy *pol, int data,
-                      bool show_total);
-u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-                        const struct blkg_rwstat *rwstat);
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
-                      int off);
-
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-                                            int off);
-
-struct blkg_conf_ctx {
-       struct gendisk                  *disk;
-       struct blkcg_gq                 *blkg;
-       u64                             v;
-};
-
-int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-                  const char *input, struct blkg_conf_ctx *ctx);
-void blkg_conf_finish(struct blkg_conf_ctx *ctx);
-
-
-static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
-{
-       return css ? container_of(css, struct blkcg, css) : NULL;
-}
-
-static inline struct blkcg *task_blkcg(struct task_struct *tsk)
-{
-       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
-}
-
-static inline struct blkcg *bio_blkcg(struct bio *bio)
-{
-       if (bio && bio->bi_css)
-               return css_to_blkcg(bio->bi_css);
-       return task_blkcg(current);
-}
-
-/**
- * blkcg_parent - get the parent of a blkcg
- * @blkcg: blkcg of interest
- *
- * Return the parent blkcg of @blkcg.  Can be called anytime.
- */
-static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
-{
-       return css_to_blkcg(blkcg->css.parent);
-}
-
-/**
- * blkg_to_pdata - get policy private data
- * @blkg: blkg of interest
- * @pol: policy of interest
- *
- * Return pointer to private data associated with the @blkg-@pol pair.
- */
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
-                                                 struct blkcg_policy *pol)
-{
-       return blkg ? blkg->pd[pol->plid] : NULL;
-}
-
-static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
-                                                    struct blkcg_policy *pol)
-{
-       return blkcg ? blkcg->pd[pol->plid] : NULL;
-}
-
-/**
- * pdata_to_blkg - get blkg associated with policy private data
- * @pd: policy private data of interest
- *
- * @pd is policy private data.  Determine the blkg it's associated with.
- */
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
-{
-       return pd ? pd->blkg : NULL;
-}
-
-/**
- * blkg_path - format cgroup path of blkg
- * @blkg: blkg of interest
- * @buf: target buffer
- * @buflen: target buffer length
- *
- * Format the path of the cgroup of @blkg into @buf.
- */
-static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
-{
-       char *p;
-
-       p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
-       if (!p) {
-               strncpy(buf, "<unavailable>", buflen);
-               return -ENAMETOOLONG;
-       }
-
-       memmove(buf, p, buf + buflen - p);
-       return 0;
-}
-
-/**
- * blkg_get - get a blkg reference
- * @blkg: blkg to get
- *
- * The caller should be holding an existing reference.
- */
-static inline void blkg_get(struct blkcg_gq *blkg)
-{
-       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-       atomic_inc(&blkg->refcnt);
-}
-
-void __blkg_release_rcu(struct rcu_head *rcu);
-
-/**
- * blkg_put - put a blkg reference
- * @blkg: blkg to put
- */
-static inline void blkg_put(struct blkcg_gq *blkg)
-{
-       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
-       if (atomic_dec_and_test(&blkg->refcnt))
-               call_rcu(&blkg->rcu_head, __blkg_release_rcu);
-}
-
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-                              bool update_hint);
-
-/**
- * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Walk @c_blkg through the descendants of @p_blkg.  Must be used with RCU
- * read locked.  If called under either blkcg or queue lock, the iteration
- * is guaranteed to include all and only online blkgs.  The caller may
- * update @pos_css by calling css_rightmost_descendant() to skip subtree.
- * @p_blkg is included in the iteration and the first node to be visited.
- */
-#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)          \
-       css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
-               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
-                                             (p_blkg)->q, false)))
-
-/**
- * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
- * @d_blkg: loop cursor pointing to the current descendant
- * @pos_css: used for iteration
- * @p_blkg: target blkg to walk descendants of
- *
- * Similar to blkg_for_each_descendant_pre() but performs post-order
- * traversal instead.  Synchronization rules are the same.  @p_blkg is
- * included in the iteration and the last node to be visited.
- */
-#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)         \
-       css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
-               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
-                                             (p_blkg)->q, false)))
-
-/**
- * blk_get_rl - get request_list to use
- * @q: request_queue of interest
- * @bio: bio which will be attached to the allocated request (may be %NULL)
- *
- * The caller wants to allocate a request from @q to use for @bio.  Find
- * the request_list to use and obtain a reference on it.  Should be called
- * under queue_lock.  This function is guaranteed to return non-%NULL
- * request_list.
- */
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-                                             struct bio *bio)
-{
-       struct blkcg *blkcg;
-       struct blkcg_gq *blkg;
-
-       rcu_read_lock();
-
-       blkcg = bio_blkcg(bio);
-
-       /* bypass blkg lookup and use @q->root_rl directly for root */
-       if (blkcg == &blkcg_root)
-               goto root_rl;
-
-       /*
-        * Try to use blkg->rl.  blkg lookup may fail under memory pressure
-        * or if either the blkcg or queue is going away.  Fall back to
-        * root_rl in such cases.
-        */
-       blkg = blkg_lookup_create(blkcg, q);
-       if (unlikely(IS_ERR(blkg)))
-               goto root_rl;
-
-       blkg_get(blkg);
-       rcu_read_unlock();
-       return &blkg->rl;
-root_rl:
-       rcu_read_unlock();
-       return &q->root_rl;
-}
-
-/**
- * blk_put_rl - put request_list
- * @rl: request_list to put
- *
- * Put the reference acquired by blk_get_rl().  Should be called under
- * queue_lock.
- */
-static inline void blk_put_rl(struct request_list *rl)
-{
-       /* root_rl may not have blkg set */
-       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
-               blkg_put(rl->blkg);
-}
-
-/**
- * blk_rq_set_rl - associate a request with a request_list
- * @rq: request of interest
- * @rl: target request_list
- *
- * Associate @rq with @rl so that accounting and freeing can know the
- * request_list @rq came from.
- */
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
-{
-       rq->rl = rl;
-}
-
-/**
- * blk_rq_rl - return the request_list a request came from
- * @rq: request of interest
- *
- * Return the request_list @rq is allocated from.
- */
-static inline struct request_list *blk_rq_rl(struct request *rq)
-{
-       return rq->rl;
-}
-
-struct request_list *__blk_queue_next_rl(struct request_list *rl,
-                                        struct request_queue *q);
-/**
- * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
- *
- * Should be used under queue_lock.
- */
-#define blk_queue_for_each_rl(rl, q)   \
-       for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
-
-static inline void blkg_stat_init(struct blkg_stat *stat)
-{
-       u64_stats_init(&stat->syncp);
-}
-
-/**
- * blkg_stat_add - add a value to a blkg_stat
- * @stat: target blkg_stat
- * @val: value to add
- *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
- */
-static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
-{
-       u64_stats_update_begin(&stat->syncp);
-       stat->cnt += val;
-       u64_stats_update_end(&stat->syncp);
-}
-
-/**
- * blkg_stat_read - read the current value of a blkg_stat
- * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
- */
-static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
-{
-       unsigned int start;
-       uint64_t v;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&stat->syncp);
-               v = stat->cnt;
-       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-       return v;
-}
-
-/**
- * blkg_stat_reset - reset a blkg_stat
- * @stat: blkg_stat to reset
- */
-static inline void blkg_stat_reset(struct blkg_stat *stat)
-{
-       stat->cnt = 0;
-}
-
-/**
- * blkg_stat_merge - merge a blkg_stat into another
- * @to: the destination blkg_stat
- * @from: the source
- *
- * Add @from's count to @to.
- */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
-{
-       blkg_stat_add(to, blkg_stat_read(from));
-}
-
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
-{
-       u64_stats_init(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_add - add a value to a blkg_rwstat
- * @rwstat: target blkg_rwstat
- * @rw: mask of REQ_{WRITE|SYNC}
- * @val: value to add
- *
- * Add @val to @rwstat.  The counters are chosen according to @rw.  The
- * caller is responsible for synchronizing calls to this function.
- */
-static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
-                                  int rw, uint64_t val)
-{
-       u64_stats_update_begin(&rwstat->syncp);
-
-       if (rw & REQ_WRITE)
-               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
-       else
-               rwstat->cnt[BLKG_RWSTAT_READ] += val;
-       if (rw & REQ_SYNC)
-               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
-       else
-               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
-
-       u64_stats_update_end(&rwstat->syncp);
-}
-
-/**
- * blkg_rwstat_read - read the current values of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
- */
-static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
-{
-       unsigned int start;
-       struct blkg_rwstat tmp;
-
-       do {
-               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-               tmp = *rwstat;
-       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
-
-       return tmp;
-}
-
-/**
- * blkg_rwstat_total - read the total count of a blkg_rwstat
- * @rwstat: blkg_rwstat to read
- *
- * Return the total count of @rwstat regardless of the IO direction.  This
- * function can be called without synchronization and takes care of u64
- * atomicity.
- */
-static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
-{
-       struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
-
-       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
-}
-
-/**
- * blkg_rwstat_reset - reset a blkg_rwstat
- * @rwstat: blkg_rwstat to reset
- */
-static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
-{
-       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
-}
-
-/**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
- * @to: the destination blkg_rwstat
- * @from: the source
- *
- * Add @from's counts to @to.
- */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-                                    struct blkg_rwstat *from)
-{
-       struct blkg_rwstat v = blkg_rwstat_read(from);
-       int i;
-
-       u64_stats_update_begin(&to->syncp);
-       for (i = 0; i < BLKG_RWSTAT_NR; i++)
-               to->cnt[i] += v.cnt[i];
-       u64_stats_update_end(&to->syncp);
-}
-
-#else  /* CONFIG_BLK_CGROUP */
-
-struct cgroup;
-struct blkcg;
-
-struct blkg_policy_data {
-};
-
-struct blkcg_policy_data {
-};
-
-struct blkcg_gq {
-};
-
-struct blkcg_policy {
-};
-
-static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
-static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
-static inline void blkcg_drain_queue(struct request_queue *q) { }
-static inline void blkcg_exit_queue(struct request_queue *q) { }
-static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
-static inline int blkcg_activate_policy(struct request_queue *q,
-                                       const struct blkcg_policy *pol) { return 0; }
-static inline void blkcg_deactivate_policy(struct request_queue *q,
-                                          const struct blkcg_policy *pol) { }
-
-static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
-
-static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
-                                                 struct blkcg_policy *pol) { return NULL; }
-static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
-static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
-static inline void blkg_get(struct blkcg_gq *blkg) { }
-static inline void blkg_put(struct blkcg_gq *blkg) { }
-
-static inline struct request_list *blk_get_rl(struct request_queue *q,
-                                             struct bio *bio) { return &q->root_rl; }
-static inline void blk_put_rl(struct request_list *rl) { }
-static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
-static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
-
-#define blk_queue_for_each_rl(rl, q)   \
-       for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
-
-#endif /* CONFIG_BLK_CGROUP */
-#endif /* _BLK_CGROUP_H */
index f6ab750060fe019f97d0ccfbca367b9e6cd3b426..688ae9482cb8eab438d3bbcaf6d61602a366cfc8 100644 (file)
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@ -63,6 +63,31 @@ struct kmem_cache *blk_requestq_cachep;
  */
 static struct workqueue_struct *kblockd_workqueue;
 
+static void blk_clear_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+       clear_wb_congested(rl->blkg->wb_congested, sync);
+#else
+       /*
+        * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+        * flip its congestion state for events on other blkcgs.
+        */
+       if (rl == &rl->q->root_rl)
+               clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
+static void blk_set_congested(struct request_list *rl, int sync)
+{
+#ifdef CONFIG_CGROUP_WRITEBACK
+       set_wb_congested(rl->blkg->wb_congested, sync);
+#else
+       /* see blk_clear_congested() */
+       if (rl == &rl->q->root_rl)
+               set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+#endif
+}
+
 void blk_queue_congestion_threshold(struct request_queue *q)
 {
        int nr;
@@ -623,8 +648,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
 
        q->backing_dev_info.ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-       q->backing_dev_info.state = 0;
-       q->backing_dev_info.capabilities = 0;
+       q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
        q->backing_dev_info.name = "block";
        q->node = node_id;
 
@@ -847,13 +871,8 @@ static void __freed_request(struct request_list *rl, int sync)
 {
        struct request_queue *q = rl->q;
 
-       /*
-        * bdi isn't aware of blkcg yet.  As all async IOs end up root
-        * blkcg anyway, just use root blkcg state.
-        */
-       if (rl == &q->root_rl &&
-           rl->count[sync] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, sync);
+       if (rl->count[sync] < queue_congestion_off_threshold(q))
+               blk_clear_congested(rl, sync);
 
        if (rl->count[sync] + 1 <= q->nr_requests) {
                if (waitqueue_active(&rl->wait[sync]))
@@ -886,25 +905,25 @@ static void freed_request(struct request_list *rl, unsigned int flags)
 int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
 {
        struct request_list *rl;
+       int on_thresh, off_thresh;
 
        spin_lock_irq(q->queue_lock);
        q->nr_requests = nr;
        blk_queue_congestion_threshold(q);
+       on_thresh = queue_congestion_on_threshold(q);
+       off_thresh = queue_congestion_off_threshold(q);
 
-       /* congestion isn't cgroup aware and follows root blkcg for now */
-       rl = &q->root_rl;
-
-       if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_SYNC);
-       else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_SYNC);
+       blk_queue_for_each_rl(rl, q) {
+               if (rl->count[BLK_RW_SYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_SYNC);
+               else if (rl->count[BLK_RW_SYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_SYNC);
 
-       if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_ASYNC);
-       else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_ASYNC);
+               if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_ASYNC);
+               else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_ASYNC);
 
-       blk_queue_for_each_rl(rl, q) {
                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
                        blk_set_rl_full(rl, BLK_RW_SYNC);
                } else {
@@ -1014,12 +1033,7 @@ static struct request *__get_request(struct request_list *rl, int rw_flags,
                                }
                        }
                }
-               /*
-                * bdi isn't aware of blkcg yet.  As all async IOs end up
-                * root blkcg anyway, just use root blkcg state.
-                */
-               if (rl == &q->root_rl)
-                       blk_set_queue_congested(q, is_sync);
+               blk_set_congested(rl, is_sync);
        }
 
        /*
index 79ffb4855af048462c8c44b3d0a7a493926989bc..f548b64be09242a77f7db6ac7cedd5f68ee397b7 100644 (file)
@@ -21,6 +21,7 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/scatterlist.h>
index 2b8fd302f677a967d87994f8a7532aab8dfe6569..6264b382d4d1ba8765dc3b22cead4fd9bf384d99 100644 (file)
@@ -6,11 +6,12 @@
 #include <linux/module.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
+#include <linux/blk-cgroup.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 #include "blk-mq.h"
 
 struct queue_sysfs_entry {
index 5b9c6d5c3636ad6412c7b898c44e4709b35597d1..b23193518ac7a964a9f0d75b26c6fa844f6eea40 100644 (file)
@@ -9,7 +9,7 @@
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
-#include "blk-cgroup.h"
+#include <linux/blk-cgroup.h>
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
index 3ab0bce1c947ef9be81f09139aa73d9bd4b76ff5..b17311227c12764f18760ee4ce71fa828f939f45 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
index d8ad45ccd8fa784a60dac66d91158eb0c9065b27..c62bb2e650b8c741e64ead5c9f32b090cbf19730 100644 (file)
@@ -14,8 +14,8 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <linux/blk-cgroup.h>
 #include "blk.h"
-#include "blk-cgroup.h"
 
 /*
  * tunables
index 942579d04128b5484f2d3e53bf38b4994ef852ee..84d63943f2de2f386ff35e6a395f68ada173b5b2 100644 (file)
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 #include <linux/pm_runtime.h>
+#include <linux/blk-cgroup.h>
 
 #include <trace/events/block.h>
 
 #include "blk.h"
-#include "blk-cgroup.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
index ea982eadaf6380b974d6b1d39a7197085217ac91..59a1395eedac45e3e5d6326ed2956caf8a7de7c8 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/kdev_t.h>
 #include <linux/kernel.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/proc_fs.h>
index b905e9888b888eb4b72fd10bae06eb06950e760b..efd19c2da9c29bab20fddff11f7df9678a3b1d55 100644 (file)
@@ -38,6 +38,7 @@
 #include <linux/mutex.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/genhd.h>
 #include <linux/idr.h>
 #include <net/tcp.h>
index 81fde9ef7f8e0e21e0d8bba275e14bdde5f90884..a1518539b8580420ea1d278c1daeeeb3e3721ae8 100644 (file)
@@ -2359,7 +2359,7 @@ static void drbd_cleanup(void)
  * @congested_data:    User data
  * @bdi_bits:          Bits the BDI flusher thread is currently interested in
  *
- * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested.
+ * Returns 1<<WB_async_congested and/or 1<<WB_sync_congested if we are congested.
  */
 static int drbd_congested(void *congested_data, int bdi_bits)
 {
@@ -2376,14 +2376,14 @@ static int drbd_congested(void *congested_data, int bdi_bits)
        }
 
        if (test_bit(CALLBACK_PENDING, &first_peer_device(device)->connection->flags)) {
-               r |= (1 << BDI_async_congested);
+               r |= (1 << WB_async_congested);
                /* Without good local data, we would need to read from remote,
                 * and that would need the worker thread as well, which is
                 * currently blocked waiting for that usermode helper to
                 * finish.
                 */
                if (!get_ldev_if_state(device, D_UP_TO_DATE))
-                       r |= (1 << BDI_sync_congested);
+                       r |= (1 << WB_sync_congested);
                else
                        put_ldev(device);
                r &= bdi_bits;
@@ -2399,9 +2399,9 @@ static int drbd_congested(void *congested_data, int bdi_bits)
                        reason = 'b';
        }
 
-       if (bdi_bits & (1 << BDI_async_congested) &&
+       if (bdi_bits & (1 << WB_async_congested) &&
            test_bit(NET_CONGESTED, &first_peer_device(device)->connection->flags)) {
-               r |= (1 << BDI_async_congested);
+               r |= (1 << WB_async_congested);
                reason = reason == 'b' ? 'a' : 'n';
        }
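
The hunks above only swap the BDI_* bits for the renamed WB_* bits in drbd's
congested callback.  As a hedged sketch under the new names (the mydrv_*
driver, its backlog test and the lower_q member are all hypothetical), a
stacking driver's congested_fn would look roughly like:

  static int mydrv_congested(void *data, int bdi_bits)
  {
          struct mydrv_device *dev = data;        /* hypothetical driver type */
          int r = 0;

          /* report our own backlog using the renamed WB_* bits */
          if (mydrv_backlogged(dev))
                  r |= (1 << WB_async_congested) | (1 << WB_sync_congested);

          /* fold in the congestion state of the device we stack on */
          r |= bdi_congested(&dev->lower_q->backing_dev_info, bdi_bits);

          return r & bdi_bits;
  }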
 
index 09e628dafd9d829eadd9abb68c6956d182a582f8..4c20c228184c321694f8b1e408f50afecc0b07ba 100644 (file)
@@ -61,6 +61,7 @@
 #include <linux/freezer.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_ioctl.h>
 #include <scsi/scsi.h>
index 5fc291c6157e3121f57d0a30e8d52a5021e3253c..60316fbaf2957d1bbbee14c67072920170954170 100644 (file)
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 #include <linux/major.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/raw.h>
 #include <linux/capability.h>
index 1616f668a4cb043741520d724b2f3602da0e07be..4afb2d26b148a3a41ca55171dee89d75da64e4f4 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/hash.h>
 #include <linux/random.h>
+#include <linux/backing-dev.h>
 
 #include <trace/events/bcache.h>
 
index 4d6f089a0e9e2eca5b8fa58017a29e1da598c2a0..d72829922eb6c8a2c81f2892f7c265a5bb0d9f24 100644 (file)
@@ -2080,7 +2080,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
                         * the query about congestion status of request_queue
                         */
                        if (dm_request_based(md))
-                               r = md->queue->backing_dev_info.state &
+                               r = md->queue->backing_dev_info.wb.state &
                                    bdi_bits;
                        else
                                r = dm_table_any_congested(map, bdi_bits);
index e6e66d087b2696ae8671631978b2fffe3c80f49e..7fff744f0865b4f7a4ad4aa8965555ba22c6706e 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/device-mapper.h>
 #include <linux/list.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/hdreg.h>
 #include <linux/completion.h>
 #include <linux/kobject.h>
index 4046a6c6f223b0b4a923404c52c582308843b887..7da6e9c3cb53eb7d28ab44f6d40dbcea63e2d486 100644 (file)
@@ -16,6 +16,7 @@
 #define _MD_MD_H
 
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/kobject.h>
 #include <linux/list.h>
 #include <linux/mm.h>
index 9157a29c8dbf133c713d67bdaaea45aaf650a255..f80f1af61ce70bce15a72d242d87c1c34da49a79 100644 (file)
@@ -745,7 +745,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
        struct r1conf *conf = mddev->private;
        int i, ret = 0;
 
-       if ((bits & (1 << BDI_async_congested)) &&
+       if ((bits & (1 << WB_async_congested)) &&
            conf->pending_count >= max_queued_requests)
                return 1;
 
@@ -760,7 +760,7 @@ static int raid1_congested(struct mddev *mddev, int bits)
                        /* Note the '|| 1' - when read_balance prefers
                         * non-congested targets, it can be removed
                         */
-                       if ((bits & (1<<BDI_async_congested)) || 1)
+                       if ((bits & (1 << WB_async_congested)) || 1)
                                ret |= bdi_congested(&q->backing_dev_info, bits);
                        else
                                ret &= bdi_congested(&q->backing_dev_info, bits);
index f55c3f35b7463141086afb727785c775c5185d76..188d8e9a6bdcc39e4da54095466f45683d6b2177 100644 (file)
@@ -914,7 +914,7 @@ static int raid10_congested(struct mddev *mddev, int bits)
        struct r10conf *conf = mddev->private;
        int i, ret = 0;
 
-       if ((bits & (1 << BDI_async_congested)) &&
+       if ((bits & (1 << WB_async_congested)) &&
            conf->pending_count >= max_queued_requests)
                return 1;
 
index b16f3cda97ff0a6c12fb9ef3d8830603f5e5dc39..e2c0057737e67473126f00af3d693ab790c612a9 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/fs.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/pagemap.h>
 #include <linux/list.h>
index d72605864b0a66adecf278a21a3ae8efc9c9255c..14562788e4e059eca40f57ee85fd74a5c9f41fa0 100644 (file)
@@ -55,9 +55,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
        if (PagePrivate(page))
                page->mapping->a_ops->invalidatepage(page, 0, PAGE_CACHE_SIZE);
 
-       if (TestClearPageDirty(page))
-               account_page_cleaned(page, mapping);
-
+       cancel_dirty_page(page);
        ClearPageMappedToDisk(page);
        ll_delete_from_page_cache(page);
 }
index 620d93489539837b5a00b6ad723e22f68dcd329b..8aa56bb6e8619721e69891ac293df75ac42efe06 100644 (file)
@@ -320,31 +320,21 @@ fail_option_alloc:
 struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
                  const char *dev_name, char *data)
 {
-       int retval = -EINVAL;
        struct p9_fid *fid;
-       int rc;
+       int rc = -ENOMEM;
 
        v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL);
        if (!v9ses->uname)
-               return ERR_PTR(-ENOMEM);
+               goto err_names;
 
        v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL);
-       if (!v9ses->aname) {
-               kfree(v9ses->uname);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!v9ses->aname)
+               goto err_names;
        init_rwsem(&v9ses->rename_sem);
 
        rc = bdi_setup_and_register(&v9ses->bdi, "9p");
-       if (rc) {
-               kfree(v9ses->aname);
-               kfree(v9ses->uname);
-               return ERR_PTR(rc);
-       }
-
-       spin_lock(&v9fs_sessionlist_lock);
-       list_add(&v9ses->slist, &v9fs_sessionlist);
-       spin_unlock(&v9fs_sessionlist_lock);
+       if (rc)
+               goto err_names;
 
        v9ses->uid = INVALID_UID;
        v9ses->dfltuid = V9FS_DEFUID;
@@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 
        v9ses->clnt = p9_client_create(dev_name, data);
        if (IS_ERR(v9ses->clnt)) {
-               retval = PTR_ERR(v9ses->clnt);
-               v9ses->clnt = NULL;
+               rc = PTR_ERR(v9ses->clnt);
                p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n");
-               goto error;
+               goto err_bdi;
        }
 
        v9ses->flags = V9FS_ACCESS_USER;
@@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        }
 
        rc = v9fs_parse_options(v9ses, data);
-       if (rc < 0) {
-               retval = rc;
-               goto error;
-       }
+       if (rc < 0)
+               goto err_clnt;
 
        v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ;
 
@@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID,
                                                        v9ses->aname);
        if (IS_ERR(fid)) {
-               retval = PTR_ERR(fid);
-               fid = NULL;
+               rc = PTR_ERR(fid);
                p9_debug(P9_DEBUG_ERROR, "cannot attach\n");
-               goto error;
+               goto err_clnt;
        }
 
        if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE)
@@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
        /* register the session for caching */
        v9fs_cache_session_get_cookie(v9ses);
 #endif
+       spin_lock(&v9fs_sessionlist_lock);
+       list_add(&v9ses->slist, &v9fs_sessionlist);
+       spin_unlock(&v9fs_sessionlist_lock);
 
        return fid;
 
-error:
+err_clnt:
+       p9_client_destroy(v9ses->clnt);
+err_bdi:
        bdi_destroy(&v9ses->bdi);
-       return ERR_PTR(retval);
+err_names:
+       kfree(v9ses->uname);
+       kfree(v9ses->aname);
+       return ERR_PTR(rc);
 }
 
 /**
index e99a338a46384128f32d6a32af8e549319f5121a..bf495cedec26a2aad7ff514a25d7609499b7d4dc 100644 (file)
@@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
        fid = v9fs_session_init(v9ses, dev_name, data);
        if (IS_ERR(fid)) {
                retval = PTR_ERR(fid);
-               /*
-                * we need to call session_close to tear down some
-                * of the data structure setup by session_init
-                */
-               goto close_session;
+               goto free_session;
        }
 
        sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses);
@@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags,
 
 clunk_fid:
        p9_client_clunk(fid);
-close_session:
        v9fs_session_close(v9ses);
+free_session:
        kfree(v9ses);
        return ERR_PTR(retval);
 
index c7e4163ede87f3370a235bdb929cf26e5cd487e8..f04c873a7365c1b50c88cf7116135597213d64bb 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/highmem.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/module.h>
 #include <linux/blkpg.h>
 #include <linux/magic.h>
@@ -546,7 +547,8 @@ static struct file_system_type bd_type = {
        .kill_sb        = kill_anon_super,
 };
 
-static struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __read_mostly;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
 
 void __init bdev_cache_init(void)
 {
@@ -687,11 +689,6 @@ static struct block_device *bd_acquire(struct inode *inode)
        return bdev;
 }
 
-int sb_is_blkdev_sb(struct super_block *sb)
-{
-       return sb == blockdev_superblock;
-}
-
 /* Call when you free inode */
 
 void bd_forget(struct inode *inode)
index f96173ad62d90413662e5ff15b14ed0df854ba5a..1cf7a53a02771eb1ebdc49564d6f42aef76a449c 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/quotaops.h>
 #include <linux/highmem.h>
 #include <linux/export.h>
+#include <linux/backing-dev.h>
 #include <linux/writeback.h>
 #include <linux/hash.h>
 #include <linux/suspend.h>
@@ -44,6 +45,9 @@
 #include <trace/events/block.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+                        unsigned long bio_flags,
+                        struct writeback_control *wbc);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
@@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
  *
  * If warn is true, then emit a warning if the page is not uptodate and has
  * not been truncated.
+ *
+ * The caller must hold mem_cgroup_begin_page_stat() lock.
  */
-static void __set_page_dirty(struct page *page,
-               struct address_space *mapping, int warn)
+static void __set_page_dirty(struct page *page, struct address_space *mapping,
+                            struct mem_cgroup *memcg, int warn)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&mapping->tree_lock, flags);
        if (page->mapping) {    /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
+               account_page_dirtied(page, mapping, memcg);
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
        spin_unlock_irqrestore(&mapping->tree_lock, flags);
-       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
 
 /*
@@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page,
 int __set_page_dirty_buffers(struct page *page)
 {
        int newly_dirty;
+       struct mem_cgroup *memcg;
        struct address_space *mapping = page_mapping(page);
 
        if (unlikely(!mapping))
@@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page)
                        bh = bh->b_this_page;
                } while (bh != head);
        }
+       /*
+        * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+        * per-memcg dirty page counters.
+        */
+       memcg = mem_cgroup_begin_page_stat(page);
        newly_dirty = !TestSetPageDirty(page);
        spin_unlock(&mapping->private_lock);
 
        if (newly_dirty)
-               __set_page_dirty(page, mapping, 1);
+               __set_page_dirty(page, mapping, memcg, 1);
+
+       mem_cgroup_end_page_stat(memcg);
+
+       if (newly_dirty)
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
        return newly_dirty;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
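
The two hunks above wrap the dirty-bit update in a mem_cgroup_begin_page_stat() /
mem_cgroup_end_page_stat() pair so the per-memcg dirty counters stay consistent
with PageDirty.  A minimal sketch of the resulting ordering is below;
set_page_dirty_sketch() is a hypothetical helper that only illustrates the
pattern (account inside the transaction, dirty the inode after ending it) and
omits the private_lock buffer walk of the real function.

static int set_page_dirty_sketch(struct page *page,
				 struct address_space *mapping)
{
	struct mem_cgroup *memcg;
	int newly_dirty;

	/* pin the page's memcg so stat updates are stable */
	memcg = mem_cgroup_begin_page_stat(page);
	newly_dirty = !TestSetPageDirty(page);
	if (newly_dirty)
		__set_page_dirty(page, mapping, memcg, 1);
	mem_cgroup_end_page_stat(memcg);

	/* inode dirtying stays outside the page-stat transaction */
	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
	return newly_dirty;
}
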
@@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh)
 
        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
+               struct address_space *mapping = NULL;
+               struct mem_cgroup *memcg;
+
+               memcg = mem_cgroup_begin_page_stat(page);
                if (!TestSetPageDirty(page)) {
-                       struct address_space *mapping = page_mapping(page);
+                       mapping = page_mapping(page);
                        if (mapping)
-                               __set_page_dirty(page, mapping, 0);
+                               __set_page_dirty(page, mapping, memcg, 0);
                }
+               mem_cgroup_end_page_stat(memcg);
+               if (mapping)
+                       __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
 }
 EXPORT_SYMBOL(mark_buffer_dirty);
@@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
        struct buffer_head *bh, *head;
        unsigned int blocksize, bbits;
        int nr_underway = 0;
-       int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
-                       WRITE_SYNC : WRITE);
+       int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
 
        head = create_page_buffers(page, inode,
                                        (1 << BH_Dirty)|(1 << BH_Uptodate));
@@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
        do {
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
-                       submit_bh(write_op, bh);
+                       submit_bh_wbc(write_op, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
@@ -1828,7 +1851,7 @@ recover:
                struct buffer_head *next = bh->b_this_page;
                if (buffer_async_write(bh)) {
                        clear_buffer_dirty(bh);
-                       submit_bh(write_op, bh);
+                       submit_bh_wbc(write_op, bh, 0, wbc);
                        nr_underway++;
                }
                bh = next;
@@ -2993,7 +3016,8 @@ void guard_bio_eod(int rw, struct bio *bio)
        }
 }
 
-int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+static int submit_bh_wbc(int rw, struct buffer_head *bh,
+                        unsigned long bio_flags, struct writeback_control *wbc)
 {
        struct bio *bio;
 
@@ -3015,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
         */
        bio = bio_alloc(GFP_NOIO, 1);
 
+       if (wbc) {
+               wbc_init_bio(wbc, bio);
+               wbc_account_io(wbc, bh->b_page, bh->b_size);
+       }
+
        bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
        bio->bi_bdev = bh->b_bdev;
        bio->bi_io_vec[0].bv_page = bh->b_page;
@@ -3039,11 +3068,16 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
        submit_bio(rw, bio);
        return 0;
 }
+
+int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
+{
+       return submit_bh_wbc(rw, bh, bio_flags, NULL);
+}
 EXPORT_SYMBOL_GPL(_submit_bh);
 
 int submit_bh(int rw, struct buffer_head *bh)
 {
-       return _submit_bh(rw, bh, 0);
+       return submit_bh_wbc(rw, bh, 0, NULL);
 }
 EXPORT_SYMBOL(submit_bh);
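
submit_bh_wbc() above is where the cgroup-writeback hooks enter the buffer
path: wbc_init_bio() associates the bio with wbc->wb and wbc_account_io()
books the bytes for foreign-inode detection.  A sketch of the same two hooks
in a filesystem's own page submission path follows; fs_submit_page() is
hypothetical and assumes the caller already set @wbc up via
wbc_attach_and_unlock_inode().

static void fs_submit_page(struct writeback_control *wbc, struct page *page,
			   sector_t sector, struct block_device *bdev,
			   bio_end_io_t *end_io)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (wbc) {
		wbc_init_bio(wbc, bio);			/* tag bio with wbc->wb */
		wbc_account_io(wbc, page, PAGE_SIZE);	/* feed foreign detection */
	}

	bio->bi_iter.bi_sector = sector;
	bio->bi_bdev = bdev;
	bio->bi_end_io = end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(WRITE, bio);
}
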
 
@@ -3232,8 +3266,8 @@ int try_to_free_buffers(struct page *page)
         * to synchronise against __set_page_dirty_buffers and prevent the
         * dirty bit from being lost.
         */
-       if (ret && TestClearPageDirty(page))
-               account_page_cleaned(page, mapping);
+       if (ret)
+               cancel_dirty_page(page);
        spin_unlock(&mapping->private_lock);
 out:
        if (buffers_to_free) {
index d0e746e965118f9dd6410f1b65396e5650cf483a..900e19cf9ef6fbb4057e95f2cc1f81c44ccd3377 100644 (file)
@@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
                ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
                 MS_POSIXACL : 0);
+       sb->s_iflags |= SB_I_CGROUPWB;
 
        if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
            (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
index d86d2622f82631f3fa46abe1047caf319b968402..aadb7282883493597f8dae099f20c3e86694bea8 100644 (file)
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/backing-dev.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
index 1c535fa67640da69def57f0e88f5c8d5e233c0bc..f6aedf88da437ee324c314bb1020baa51db0423c 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/log2.h>
 #include <linux/module.h>
 #include <linux/slab.h>
+#include <linux/backing-dev.h>
 #include <trace/events/ext4.h>
 
 #ifdef CONFIG_EXT4_DEBUG
index 90ec13fe8ac73e5d81cda8ce4d267afbf2ff571a..a7b4b6e1026920823a149b2f124371cba092e387 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
index d9c52424bac21555f5a84a2f8ef9f7f2dc7751c5..7dd63b794bfb5a04ae0d203c9ed2c8e739f0eb08 100644 (file)
@@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
                                                        PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
        } else if (type == DIRTY_DENTS) {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
                mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
                                sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
        } else {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
        }
        return res;
index 8496357781188188126c1de28afc55f347d10198..79e7b879a75321047bf00fd22d6b55d37a270af7 100644 (file)
@@ -9,6 +9,7 @@
  * published by the Free Software Foundation.
  */
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 /* constant macro */
 #define NULL_SEGNO                     ((unsigned int)(~0))
@@ -714,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi)
  */
 static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
 {
-       if (sbi->sb->s_bdi->dirty_exceeded)
+       if (sbi->sb->s_bdi->wb.dirty_exceeded)
                return 0;
 
        if (type == DATA)
index 442d50a0e33e6a37daf2c315a2a424f43ddfcdaa..a08f1039909a76e6427cf9e64ddf16c30a1716c9 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
 #include "fat.h"
index c06774658345effe930825df56caedb1c296e97e..509411dd3698959c0b8c387dcf0f487b5da12abe 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/uio.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <asm/unaligned.h>
 #include "fat.h"
 
index 32a8bbd7a9ad1121f9b100da08f80500736bcebf..f0520bcf209442914eff0ed60d380fb3d2c66402 100644 (file)
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include <linux/device.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
  */
 #define MIN_WRITEBACK_PAGES    (4096UL >> (PAGE_CACHE_SHIFT - 10))
 
+struct wb_completion {
+       atomic_t                cnt;
+};
+
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -47,12 +52,28 @@ struct wb_writeback_work {
        unsigned int range_cyclic:1;
        unsigned int for_background:1;
        unsigned int for_sync:1;        /* sync(2) WB_SYNC_ALL writeback */
+       unsigned int auto_free:1;       /* free on completion */
+       unsigned int single_wait:1;
+       unsigned int single_done:1;
        enum wb_reason reason;          /* why was writeback initiated? */
 
        struct list_head list;          /* pending work list */
-       struct completion *done;        /* set if the caller waits */
+       struct wb_completion *done;     /* set if the caller waits */
 };
 
+/*
+ * If one wants to wait for one or more wb_writeback_works, each work's
+ * ->done should be set to a wb_completion defined using the following
+ * macro.  Once all work items are issued with wb_queue_work(), the caller
+ * can wait for the completion of all using wb_wait_for_completion().  Work
+ * items which are waited upon aren't freed automatically on completion.
+ */
+#define DEFINE_WB_COMPLETION_ONSTACK(cmpl)                             \
+       struct wb_completion cmpl = {                                   \
+               .cnt            = ATOMIC_INIT(1),                       \
+       }
+
+
 /*
  * If an inode is constantly having its pages dirtied, but then the
  * updates stop dirtytime_expire_interval seconds in the past, it's
@@ -65,35 +86,6 @@ struct wb_writeback_work {
  */
 unsigned int dirtytime_expire_interval = 12 * 60 * 60;
 
-/**
- * writeback_in_progress - determine whether there is writeback in progress
- * @bdi: the device's backing_dev_info structure.
- *
- * Determine whether there is writeback waiting to be handled against a
- * backing device.
- */
-int writeback_in_progress(struct backing_dev_info *bdi)
-{
-       return test_bit(BDI_writeback_running, &bdi->state);
-}
-EXPORT_SYMBOL(writeback_in_progress);
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode)
-{
-       struct super_block *sb;
-
-       if (!inode)
-               return &noop_backing_dev_info;
-
-       sb = inode->i_sb;
-#ifdef CONFIG_BLOCK
-       if (sb_is_blkdev_sb(sb))
-               return blk_get_backing_dev_info(I_BDEV(inode));
-#endif
-       return sb->s_bdi;
-}
-EXPORT_SYMBOL_GPL(inode_to_bdi);
-
 static inline struct inode *wb_inode(struct list_head *head)
 {
        return list_entry(head, struct inode, i_wb_list);
@@ -109,45 +101,830 @@ static inline struct inode *wb_inode(struct list_head *head)
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage);
 
-static void bdi_wakeup_thread(struct backing_dev_info *bdi)
+static bool wb_io_lists_populated(struct bdi_writeback *wb)
+{
+       if (wb_has_dirty_io(wb)) {
+               return false;
+       } else {
+               set_bit(WB_has_dirty_io, &wb->state);
+               WARN_ON_ONCE(!wb->avg_write_bandwidth);
+               atomic_long_add(wb->avg_write_bandwidth,
+                               &wb->bdi->tot_write_bandwidth);
+               return true;
+       }
+}
+
+static void wb_io_lists_depopulated(struct bdi_writeback *wb)
 {
-       spin_lock_bh(&bdi->wb_lock);
-       if (test_bit(BDI_registered, &bdi->state))
-               mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-       spin_unlock_bh(&bdi->wb_lock);
+       if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) &&
+           list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) {
+               clear_bit(WB_has_dirty_io, &wb->state);
+               WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth,
+                                       &wb->bdi->tot_write_bandwidth) < 0);
+       }
 }
 
-static void bdi_queue_work(struct backing_dev_info *bdi,
-                          struct wb_writeback_work *work)
+/**
+ * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list
+ * @inode: inode to be moved
+ * @wb: target bdi_writeback
+ * @head: one of @wb->b_{dirty|io|more_io}
+ *
+ * Move @inode->i_wb_list to @head of @wb and set %WB_has_dirty_io.
+ * Returns %true if @inode is the first occupant of the !dirty_time IO
+ * lists; otherwise, %false.
+ */
+static bool inode_wb_list_move_locked(struct inode *inode,
+                                     struct bdi_writeback *wb,
+                                     struct list_head *head)
 {
-       trace_writeback_queue(bdi, work);
+       assert_spin_locked(&wb->list_lock);
+
+       list_move(&inode->i_wb_list, head);
 
-       spin_lock_bh(&bdi->wb_lock);
-       if (!test_bit(BDI_registered, &bdi->state)) {
-               if (work->done)
-                       complete(work->done);
+       /* dirty_time doesn't count as dirty_io until expiration */
+       if (head != &wb->b_dirty_time)
+               return wb_io_lists_populated(wb);
+
+       wb_io_lists_depopulated(wb);
+       return false;
+}
+
+/**
+ * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list
+ * @inode: inode to be removed
+ * @wb: bdi_writeback @inode is being removed from
+ *
+ * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and
+ * clear %WB_has_dirty_io if all are empty afterwards.
+ */
+static void inode_wb_list_del_locked(struct inode *inode,
+                                    struct bdi_writeback *wb)
+{
+       assert_spin_locked(&wb->list_lock);
+
+       list_del_init(&inode->i_wb_list);
+       wb_io_lists_depopulated(wb);
+}
+
+static void wb_wakeup(struct bdi_writeback *wb)
+{
+       spin_lock_bh(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               mod_delayed_work(bdi_wq, &wb->dwork, 0);
+       spin_unlock_bh(&wb->work_lock);
+}
+
+static void wb_queue_work(struct bdi_writeback *wb,
+                         struct wb_writeback_work *work)
+{
+       trace_writeback_queue(wb->bdi, work);
+
+       spin_lock_bh(&wb->work_lock);
+       if (!test_bit(WB_registered, &wb->state)) {
+               if (work->single_wait)
+                       work->single_done = 1;
                goto out_unlock;
        }
-       list_add_tail(&work->list, &bdi->work_list);
-       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+       if (work->done)
+               atomic_inc(&work->done->cnt);
+       list_add_tail(&work->list, &wb->work_list);
+       mod_delayed_work(bdi_wq, &wb->dwork, 0);
 out_unlock:
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_unlock_bh(&wb->work_lock);
+}
+
+/**
+ * wb_wait_for_completion - wait for completion of bdi_writeback_works
+ * @bdi: bdi work items were issued to
+ * @done: target wb_completion
+ *
+ * Wait for one or more work items issued to @bdi with their ->done field
+ * set to @done, which should have been defined with
+ * DEFINE_WB_COMPLETION_ONSTACK().  This function returns after all such
+ * work items are completed.  Work items which are waited upon aren't freed
+ * automatically on completion.
+ */
+static void wb_wait_for_completion(struct backing_dev_info *bdi,
+                                  struct wb_completion *done)
+{
+       atomic_dec(&done->cnt);         /* put down the initial count */
+       wait_event(bdi->wb_waitq, !atomic_read(&done->cnt));
+}
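
DEFINE_WB_COMPLETION_ONSTACK(), wb_queue_work() and wb_wait_for_completion()
together form the issue-then-wait pattern that callers issuing waited-upon
work items are expected to follow.  A sketch of that pattern, local to
fs-writeback.c; the helper name and field values are illustrative only.

static void example_issue_and_wait(struct backing_dev_info *bdi,
				   struct bdi_writeback *wb)
{
	DEFINE_WB_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.nr_pages	= LONG_MAX,
		.sync_mode	= WB_SYNC_NONE,
		.reason		= WB_REASON_SYNC,
		.done		= &done,	/* we will wait for this work */
		.auto_free	= 0,		/* waited-upon work isn't freed */
	};

	wb_queue_work(wb, &work);		/* bumps done.cnt */
	wb_wait_for_completion(bdi, &done);	/* drops the initial count, sleeps */
}
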
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/* parameters for foreign inode detection, see wb_detach_inode() */
+#define WB_FRN_TIME_SHIFT      13      /* 1s = 2^13, up to 8 secs w/ 16bit */
+#define WB_FRN_TIME_AVG_SHIFT  3       /* avg = avg * 7/8 + new * 1/8 */
+#define WB_FRN_TIME_CUT_DIV    2       /* ignore rounds < avg / 2 */
+#define WB_FRN_TIME_PERIOD     (2 * (1 << WB_FRN_TIME_SHIFT))  /* 2s */
+
+#define WB_FRN_HIST_SLOTS      16      /* inode->i_wb_frn_history is 16bit */
+#define WB_FRN_HIST_UNIT       (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS)
+                                       /* each slot's duration is 2s / 16 */
+#define WB_FRN_HIST_THR_SLOTS  (WB_FRN_HIST_SLOTS / 2)
+                                       /* if foreign slots >= 8, switch */
+#define WB_FRN_HIST_MAX_SLOTS  (WB_FRN_HIST_THR_SLOTS / 2 + 1)
+                                       /* one round can affect up to 5 slots */
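
For reference, the constants above work out as follows (plain arithmetic,
no new behaviour):

/*
 * WB_FRN_TIME_PERIOD    = 2 * 2^13  -> 2 seconds in FRN time units
 * WB_FRN_HIST_UNIT      = 2^14 / 16 -> one history slot covers ~125ms
 * WB_FRN_HIST_THR_SLOTS = 8         -> switch once foreign IO fills more
 *                                      than 8 of the 16 history slots
 * WB_FRN_HIST_MAX_SLOTS = 5         -> one writeback round shifts in at
 *                                      most 5 slots of history
 */
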
+
+void __inode_attach_wb(struct inode *inode, struct page *page)
+{
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct bdi_writeback *wb = NULL;
+
+       if (inode_cgwb_enabled(inode)) {
+               struct cgroup_subsys_state *memcg_css;
+
+               if (page) {
+                       memcg_css = mem_cgroup_css_from_page(page);
+                       wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+               } else {
+                       /* must pin memcg_css, see wb_get_create() */
+                       memcg_css = task_get_css(current, memory_cgrp_id);
+                       wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+                       css_put(memcg_css);
+               }
+       }
+
+       if (!wb)
+               wb = &bdi->wb;
+
+       /*
+        * There may be multiple instances of this function racing to
+        * update the same inode.  Use cmpxchg() to tell the winner.
+        */
+       if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
+               wb_put(wb);
+}
+
+/**
+ * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
+ * @inode: inode of interest with i_lock held
+ *
+ * Returns @inode's wb with its list_lock held.  @inode->i_lock must be
+ * held on entry and is released on return.  The returned wb is guaranteed
+ * to stay @inode's associated wb until its list_lock is released.
+ */
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+       __releases(&inode->i_lock)
+       __acquires(&wb->list_lock)
+{
+       while (true) {
+               struct bdi_writeback *wb = inode_to_wb(inode);
+
+               /*
+                * inode_to_wb() association is protected by both
+                * @inode->i_lock and @wb->list_lock but list_lock nests
+                * outside i_lock.  Drop i_lock and verify that the
+                * association hasn't changed after acquiring list_lock.
+                */
+               wb_get(wb);
+               spin_unlock(&inode->i_lock);
+               spin_lock(&wb->list_lock);
+               wb_put(wb);             /* not gonna deref it anymore */
+
+               /* i_wb may have changed in between, can't use inode_to_wb() */
+               if (likely(wb == inode->i_wb))
+                       return wb;      /* @inode already has ref */
+
+               spin_unlock(&wb->list_lock);
+               cpu_relax();
+               spin_lock(&inode->i_lock);
+       }
+}
+
+/**
+ * inode_to_wb_and_lock_list - determine an inode's wb and lock it
+ * @inode: inode of interest
+ *
+ * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held
+ * on entry.
+ */
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+       __acquires(&wb->list_lock)
+{
+       spin_lock(&inode->i_lock);
+       return locked_inode_to_wb_and_lock_list(inode);
+}
+
+struct inode_switch_wbs_context {
+       struct inode            *inode;
+       struct bdi_writeback    *new_wb;
+
+       struct rcu_head         rcu_head;
+       struct work_struct      work;
+};
+
+static void inode_switch_wbs_work_fn(struct work_struct *work)
+{
+       struct inode_switch_wbs_context *isw =
+               container_of(work, struct inode_switch_wbs_context, work);
+       struct inode *inode = isw->inode;
+       struct address_space *mapping = inode->i_mapping;
+       struct bdi_writeback *old_wb = inode->i_wb;
+       struct bdi_writeback *new_wb = isw->new_wb;
+       struct radix_tree_iter iter;
+       bool switched = false;
+       void **slot;
+
+       /*
+        * By the time control reaches here, RCU grace period has passed
+        * since I_WB_SWITCH assertion and all wb stat update transactions
+        * between unlocked_inode_to_wb_begin/end() are guaranteed to be
+        * synchronizing against mapping->tree_lock.
+        *
+        * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock
+        * gives us exclusion against all wb related operations on @inode
+        * including IO list manipulations and stat updates.
+        */
+       if (old_wb < new_wb) {
+               spin_lock(&old_wb->list_lock);
+               spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING);
+       } else {
+               spin_lock(&new_wb->list_lock);
+               spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING);
+       }
+       spin_lock(&inode->i_lock);
+       spin_lock_irq(&mapping->tree_lock);
+
+       /*
+        * Once I_FREEING is visible under i_lock, the eviction path owns
+        * the inode and we shouldn't modify ->i_wb_list.
+        */
+       if (unlikely(inode->i_state & I_FREEING))
+               goto skip_switch;
+
+       /*
+        * Count and transfer stats.  Note that PAGECACHE_TAG_DIRTY points
+        * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to
+        * pages actually under writeback.
+        */
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_DIRTY) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page) && PageDirty(page)) {
+                       __dec_wb_stat(old_wb, WB_RECLAIMABLE);
+                       __inc_wb_stat(new_wb, WB_RECLAIMABLE);
+               }
+       }
+
+       radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0,
+                                  PAGECACHE_TAG_WRITEBACK) {
+               struct page *page = radix_tree_deref_slot_protected(slot,
+                                                       &mapping->tree_lock);
+               if (likely(page)) {
+                       WARN_ON_ONCE(!PageWriteback(page));
+                       __dec_wb_stat(old_wb, WB_WRITEBACK);
+                       __inc_wb_stat(new_wb, WB_WRITEBACK);
+               }
+       }
+
+       wb_get(new_wb);
+
+       /*
+        * Transfer to @new_wb's IO list if necessary.  The specific list
+        * @inode was on is ignored and the inode is put on ->b_dirty which
+        * is always correct including from ->b_dirty_time.  The transfer
+        * preserves @inode->dirtied_when ordering.
+        */
+       if (!list_empty(&inode->i_wb_list)) {
+               struct inode *pos;
+
+               inode_wb_list_del_locked(inode, old_wb);
+               inode->i_wb = new_wb;
+               list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list)
+                       if (time_after_eq(inode->dirtied_when,
+                                         pos->dirtied_when))
+                               break;
+               inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev);
+       } else {
+               inode->i_wb = new_wb;
+       }
+
+       /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */
+       inode->i_wb_frn_winner = 0;
+       inode->i_wb_frn_avg_time = 0;
+       inode->i_wb_frn_history = 0;
+       switched = true;
+skip_switch:
+       /*
+        * Paired with load_acquire in unlocked_inode_to_wb_begin() and
+        * ensures that the new wb is visible if they see !I_WB_SWITCH.
+        */
+       smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH);
+
+       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock(&inode->i_lock);
+       spin_unlock(&new_wb->list_lock);
+       spin_unlock(&old_wb->list_lock);
+
+       if (switched) {
+               wb_wakeup(new_wb);
+               wb_put(old_wb);
+       }
+       wb_put(new_wb);
+
+       iput(inode);
+       kfree(isw);
+}
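
The lock ordering at the top of inode_switch_wbs_work_fn() (always take the
lower-addressed wb's list_lock first) is what prevents ABBA deadlocks between
two switches running in opposite directions.  A generic sketch of the idiom;
lock_two_wb_lists() is hypothetical and assumes distinct wbs, as on this path.

static void lock_two_wb_lists(struct bdi_writeback *a, struct bdi_writeback *b)
{
	/* callers guarantee a != b; lock the lower address first */
	if (a < b) {
		spin_lock(&a->list_lock);
		spin_lock_nested(&b->list_lock, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(&b->list_lock);
		spin_lock_nested(&a->list_lock, SINGLE_DEPTH_NESTING);
	}
}
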
+
+static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
+{
+       struct inode_switch_wbs_context *isw = container_of(rcu_head,
+                               struct inode_switch_wbs_context, rcu_head);
+
+       /* needs to grab bh-unsafe locks, bounce to work item */
+       INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
+       schedule_work(&isw->work);
+}
+
+/**
+ * inode_switch_wbs - change the wb association of an inode
+ * @inode: target inode
+ * @new_wb_id: ID of the new wb
+ *
+ * Switch @inode's wb association to the wb identified by @new_wb_id.  The
+ * switching is performed asynchronously and may fail silently.
+ */
+static void inode_switch_wbs(struct inode *inode, int new_wb_id)
+{
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct cgroup_subsys_state *memcg_css;
+       struct inode_switch_wbs_context *isw;
+
+       /* noop if seems to be already in progress */
+       if (inode->i_state & I_WB_SWITCH)
+               return;
+
+       isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
+       if (!isw)
+               return;
+
+       /* find and pin the new wb */
+       rcu_read_lock();
+       memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys);
+       if (memcg_css)
+               isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);
+       rcu_read_unlock();
+       if (!isw->new_wb)
+               goto out_free;
+
+       /* while holding I_WB_SWITCH, no one else can update the association */
+       spin_lock(&inode->i_lock);
+       if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+           inode_to_wb(inode) == isw->new_wb) {
+               spin_unlock(&inode->i_lock);
+               goto out_free;
+       }
+       inode->i_state |= I_WB_SWITCH;
+       spin_unlock(&inode->i_lock);
+
+       ihold(inode);
+       isw->inode = inode;
+
+       /*
+        * In addition to synchronizing among switchers, I_WB_SWITCH tells
+        * the RCU protected stat update paths to grab the mapping's
+        * tree_lock so that stat transfer can synchronize against them.
+        * Let's continue after I_WB_SWITCH is guaranteed to be visible.
+        */
+       call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
+       return;
+
+out_free:
+       if (isw->new_wb)
+               wb_put(isw->new_wb);
+       kfree(isw);
+}
+
+/**
+ * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * @inode is locked and about to be written back under the control of @wbc.
+ * Record @inode's writeback context into @wbc and unlock the i_lock.  On
+ * writeback completion, wbc_detach_inode() should be called.  This is used
+ * to track the cgroup writeback context.
+ */
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+                                struct inode *inode)
+{
+       if (!inode_cgwb_enabled(inode)) {
+               spin_unlock(&inode->i_lock);
+               return;
+       }
+
+       wbc->wb = inode_to_wb(inode);
+       wbc->inode = inode;
+
+       wbc->wb_id = wbc->wb->memcg_css->id;
+       wbc->wb_lcand_id = inode->i_wb_frn_winner;
+       wbc->wb_tcand_id = 0;
+       wbc->wb_bytes = 0;
+       wbc->wb_lcand_bytes = 0;
+       wbc->wb_tcand_bytes = 0;
+
+       wb_get(wbc->wb);
+       spin_unlock(&inode->i_lock);
+
+       /*
+        * A dying wb indicates that the memcg-blkcg mapping has changed
+        * and a new wb is already serving the memcg.  Switch immediately.
+        */
+       if (unlikely(wb_dying(wbc->wb)))
+               inode_switch_wbs(inode, wbc->wb_id);
+}
+
+/**
+ * wbc_detach_inode - disassociate wbc from inode and perform foreign detection
+ * @wbc: writeback_control of the just finished writeback
+ *
+ * To be called after a writeback attempt of an inode finishes and undoes
+ * wbc_attach_and_unlock_inode().  Can be called under any context.
+ *
+ * As concurrent write sharing of an inode is expected to be very rare and
+ * memcg only tracks page ownership on a first-use basis, severely confining
+ * the usefulness of such sharing, cgroup writeback tracks ownership
+ * per-inode.  While the support for concurrent write sharing of an inode
+ * is deemed unnecessary, an inode being written to by different cgroups at
+ * different points in time is a lot more common, and, more importantly,
+ * charging only by first-use can too readily lead to grossly incorrect
+ * behaviors (single foreign page can lead to gigabytes of writeback to be
+ * incorrectly attributed).
+ *
+ * To resolve this issue, cgroup writeback detects the majority dirtier of
+ * an inode and transfers the ownership to it.  To avoid unnecessary
+ * oscillation, the detection mechanism keeps track of history and gives
+ * out the switch verdict only if the foreign usage pattern is stable over
+ * a certain amount of time and/or writeback attempts.
+ *
+ * On each writeback attempt, @wbc tries to detect the majority writer
+ * using Boyer-Moore majority vote algorithm.  In addition to the byte
+ * count from the majority voting, it also counts the bytes written for the
+ * current wb and the last round's winner wb (max of last round's current
+ * wb, the winner from two rounds ago, and the last round's majority
+ * candidate).  Keeping track of the historical winner helps the algorithm
+ * to semi-reliably detect the most active writer even when it's not the
+ * absolute majority.
+ *
+ * Once the winner of the round is determined, whether the winner is
+ * foreign or not and how much IO time the round consumed is recorded in
+ * inode->i_wb_frn_history.  If the amount of recorded foreign IO time is
+ * over a certain threshold, the switch verdict is given.
+ */
+void wbc_detach_inode(struct writeback_control *wbc)
+{
+       struct bdi_writeback *wb = wbc->wb;
+       struct inode *inode = wbc->inode;
+       unsigned long avg_time, max_bytes, max_time;
+       u16 history;
+       int max_id;
+
+       if (!wb)
+               return;
+
+       history = inode->i_wb_frn_history;
+       avg_time = inode->i_wb_frn_avg_time;
+
+       /* pick the winner of this round */
+       if (wbc->wb_bytes >= wbc->wb_lcand_bytes &&
+           wbc->wb_bytes >= wbc->wb_tcand_bytes) {
+               max_id = wbc->wb_id;
+               max_bytes = wbc->wb_bytes;
+       } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) {
+               max_id = wbc->wb_lcand_id;
+               max_bytes = wbc->wb_lcand_bytes;
+       } else {
+               max_id = wbc->wb_tcand_id;
+               max_bytes = wbc->wb_tcand_bytes;
+       }
+
+       /*
+        * Calculate the amount of IO time the winner consumed and fold it
+        * into the running average kept per inode.  If the consumed IO
+        * time is lower than avg_time / WB_FRN_TIME_CUT_DIV, ignore it for
+        * deciding whether to switch or not.  This is to prevent one-off
+        * small dirtiers from skewing the verdict.
+        */
+       max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT,
+                               wb->avg_write_bandwidth);
+       if (avg_time)
+               avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) -
+                           (avg_time >> WB_FRN_TIME_AVG_SHIFT);
+       else
+               avg_time = max_time;    /* immediate catch up on first run */
+
+       if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) {
+               int slots;
+
+               /*
+                * The switch verdict is reached if foreign wb's consume
+                * more than a certain proportion of IO time in a
+                * WB_FRN_TIME_PERIOD.  This is loosely tracked by 16 slot
+                * history mask where each bit represents one sixteenth of
+                * the period.  Determine the number of slots to shift into
+                * history from @max_time.
+                */
+               slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT),
+                           (unsigned long)WB_FRN_HIST_MAX_SLOTS);
+               history <<= slots;
+               if (wbc->wb_id != max_id)
+                       history |= (1U << slots) - 1;
+
+               /*
+                * Switch if the current wb isn't the consistent winner.
+                * If there are multiple closely competing dirtiers, the
+                * inode may switch across them repeatedly over time, which
+                * is okay.  The main goal is avoiding keeping an inode on
+                * the wrong wb for an extended period of time.
+                */
+               if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+                       inode_switch_wbs(inode, max_id);
+       }
+
+       /*
+        * Multiple instances of this function may race to update the
+        * following fields but we don't mind occasional inaccuracies.
+        */
+       inode->i_wb_frn_winner = max_id;
+       inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX);
+       inode->i_wb_frn_history = history;
+
+       wb_put(wbc->wb);
+       wbc->wb = NULL;
+}
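
A worked round of the verdict math above, with made-up figures:

/*
 * Example (all numbers illustrative):
 *
 *   wb->avg_write_bandwidth = 25600 pages/s   (~100MB/s with 4k pages)
 *   this round's winner wrote max_bytes = 4MB (1024 pages)
 *
 *   max_time = DIV_ROUND_UP(1024 << WB_FRN_TIME_SHIFT, 25600)
 *            = 328                            (~40ms in FRN time units)
 *
 * Provided max_time isn't discarded as too small against avg_time, it
 * shifts slots = DIV_ROUND_UP(328, WB_FRN_HIST_UNIT) = 1 slot into the
 * history, set to 1 if the winner was foreign.  Only once more than
 * WB_FRN_HIST_THR_SLOTS (8) of the 16 slots are foreign does
 * inode_switch_wbs() actually get called.
 */
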
+
+/**
+ * wbc_account_io - account IO issued during writeback
+ * @wbc: writeback_control of the writeback in progress
+ * @page: page being written out
+ * @bytes: number of bytes being written out
+ *
+ * @bytes from @page are about to be written out during the writeback
+ * controlled by @wbc.  Keep the book for foreign inode detection.  See
+ * wbc_detach_inode().
+ */
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+                   size_t bytes)
+{
+       int id;
+
+       /*
+        * pageout() path doesn't attach @wbc to the inode being written
+        * out.  This is intentional as we don't want the function to block
+        * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+        * regular writeback instead of writing things out itself.
+        */
+       if (!wbc->wb)
+               return;
+
+       rcu_read_lock();
+       id = mem_cgroup_css_from_page(page)->id;
+       rcu_read_unlock();
+
+       if (id == wbc->wb_id) {
+               wbc->wb_bytes += bytes;
+               return;
+       }
+
+       if (id == wbc->wb_lcand_id)
+               wbc->wb_lcand_bytes += bytes;
+
+       /* Boyer-Moore majority vote algorithm */
+       if (!wbc->wb_tcand_bytes)
+               wbc->wb_tcand_id = id;
+       if (id == wbc->wb_tcand_id)
+               wbc->wb_tcand_bytes += bytes;
+       else
+               wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes);
 }
 
-static void
-__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                     bool range_cyclic, enum wb_reason reason)
+/**
+ * inode_congested - test whether an inode is congested
+ * @inode: inode to test for congestion
+ * @cong_bits: mask of WB_[a]sync_congested bits to test
+ *
+ * Tests whether @inode is congested.  @cong_bits is the mask of congestion
+ * bits to test and the return value is the mask of set bits.
+ *
+ * If cgroup writeback is enabled for @inode, the congestion state is
+ * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg
+ * associated with @inode is congested; otherwise, the root wb's congestion
+ * state is used.
+ */
+int inode_congested(struct inode *inode, int cong_bits)
+{
+       /*
+        * Once set, ->i_wb never becomes NULL while the inode is alive.
+        * Start transaction iff ->i_wb is visible.
+        */
+       if (inode && inode_to_wb_is_valid(inode)) {
+               struct bdi_writeback *wb;
+               bool locked, congested;
+
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+               congested = wb_congested(wb, cong_bits);
+               unlocked_inode_to_wb_end(inode, locked);
+               return congested;
+       }
+
+       return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+EXPORT_SYMBOL_GPL(inode_congested);
+
+/**
+ * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work
+ * @bdi: bdi the work item was issued to
+ * @work: work item to wait for
+ *
+ * Wait for the completion of @work which was issued to one of @bdi's
+ * bdi_writeback's.  The caller must have set @work->single_wait before
+ * issuing it.  This wait operates independently of
+ * wb_wait_for_completion() and also disables automatic freeing of @work.
+ */
+static void wb_wait_for_single_work(struct backing_dev_info *bdi,
+                                   struct wb_writeback_work *work)
+{
+       if (WARN_ON_ONCE(!work->single_wait))
+               return;
+
+       wait_event(bdi->wb_waitq, work->single_done);
+
+       /*
+        * Paired with smp_wmb() in wb_do_writeback() and ensures that all
+        * modifications to @work prior to assertion of ->single_done is
+        * visible to the caller once this function returns.
+        */
+       smp_rmb();
+}
+
+/**
+ * wb_split_bdi_pages - split nr_pages to write according to bandwidth
+ * @wb: target bdi_writeback to split @nr_pages to
+ * @nr_pages: number of pages to write for the whole bdi
+ *
+ * Split @wb's portion of @nr_pages according to @wb's write bandwidth in
+ * relation to the total write bandwidth of all wb's w/ dirty inodes on
+ * @wb->bdi.
+ */
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+       if (nr_pages == LONG_MAX)
+               return LONG_MAX;
+
+       /*
+        * This may be called on clean wb's and proportional distribution
+        * may not make sense, just use the original @nr_pages in those
+        * cases.  In general, we wanna err on the side of writing more.
+        */
+       if (!tot_bw || this_bw >= tot_bw)
+               return nr_pages;
+       else
+               return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
+}
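
An example of the proportional split above, with made-up bandwidth figures:

/*
 * For a bdi with tot_write_bandwidth = 30000 pages/s, a wb contributing
 * this_bw = 10000 pages/s gets
 *
 *   DIV_ROUND_UP_ULL(3000 * 10000, 30000) = 1000
 *
 * pages out of a 3000-page request.  LONG_MAX requests, and wbs holding
 * all of the bandwidth, pass nr_pages through unchanged.
 */
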
+
+/**
+ * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb
+ * @wb: target bdi_writeback
+ * @base_work: source wb_writeback_work
+ *
+ * Try to make a clone of @base_work and issue it to @wb.  If cloning
+ * succeeds, %true is returned; otherwise, @base_work is issued directly
+ * and %false is returned.  In the latter case, the caller is required to
+ * wait for @base_work's completion using wb_wait_for_single_work().
+ *
+ * A clone is auto-freed on completion.  @base_work never is.
+ */
+static bool wb_clone_and_queue_work(struct bdi_writeback *wb,
+                                   struct wb_writeback_work *base_work)
 {
        struct wb_writeback_work *work;
 
+       work = kmalloc(sizeof(*work), GFP_ATOMIC);
+       if (work) {
+               *work = *base_work;
+               work->auto_free = 1;
+               work->single_wait = 0;
+       } else {
+               work = base_work;
+               work->auto_free = 0;
+               work->single_wait = 1;
+       }
+       work->single_done = 0;
+       wb_queue_work(wb, work);
+       return work != base_work;
+}
+
+/**
+ * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
+ * @bdi: target backing_dev_info
+ * @base_work: wb_writeback_work to issue
+ * @skip_if_busy: skip wb's which already have writeback in progress
+ *
+ * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which
+ * have dirty inodes.  If @base_work->nr_pages isn't %LONG_MAX, it's
+ * distributed to the busy wbs according to each wb's proportion in the
+ * total active write bandwidth of @bdi.
+ */
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+                                 struct wb_writeback_work *base_work,
+                                 bool skip_if_busy)
+{
+       long nr_pages = base_work->nr_pages;
+       int next_blkcg_id = 0;
+       struct bdi_writeback *wb;
+       struct wb_iter iter;
+
+       might_sleep();
+
+       if (!bdi_has_dirty_io(bdi))
+               return;
+restart:
+       rcu_read_lock();
+       bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) {
+               if (!wb_has_dirty_io(wb) ||
+                   (skip_if_busy && writeback_in_progress(wb)))
+                       continue;
+
+               base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages);
+               if (!wb_clone_and_queue_work(wb, base_work)) {
+                       next_blkcg_id = wb->blkcg_css->id + 1;
+                       rcu_read_unlock();
+                       wb_wait_for_single_work(bdi, base_work);
+                       goto restart;
+               }
+       }
+       rcu_read_unlock();
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static struct bdi_writeback *
+locked_inode_to_wb_and_lock_list(struct inode *inode)
+       __releases(&inode->i_lock)
+       __acquires(&wb->list_lock)
+{
+       struct bdi_writeback *wb = inode_to_wb(inode);
+
+       spin_unlock(&inode->i_lock);
+       spin_lock(&wb->list_lock);
+       return wb;
+}
+
+static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode)
+       __acquires(&wb->list_lock)
+{
+       struct bdi_writeback *wb = inode_to_wb(inode);
+
+       spin_lock(&wb->list_lock);
+       return wb;
+}
+
+static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages)
+{
+       return nr_pages;
+}
+
+static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+                                 struct wb_writeback_work *base_work,
+                                 bool skip_if_busy)
+{
+       might_sleep();
+
+       if (bdi_has_dirty_io(bdi) &&
+           (!skip_if_busy || !writeback_in_progress(&bdi->wb))) {
+               base_work->auto_free = 0;
+               base_work->single_wait = 0;
+               base_work->single_done = 0;
+               wb_queue_work(&bdi->wb, base_work);
+       }
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+                       bool range_cyclic, enum wb_reason reason)
+{
+       struct wb_writeback_work *work;
+
+       if (!wb_has_dirty_io(wb))
+               return;
+
        /*
         * This is WB_SYNC_NONE writeback, so if allocation fails just
         * wakeup the thread for old dirty data writeback
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               trace_writeback_nowork(bdi);
-               bdi_wakeup_thread(bdi);
+               trace_writeback_nowork(wb->bdi);
+               wb_wakeup(wb);
                return;
        }
 
@@ -155,46 +932,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
        work->nr_pages  = nr_pages;
        work->range_cyclic = range_cyclic;
        work->reason    = reason;
+       work->auto_free = 1;
 
-       bdi_queue_work(bdi, work);
+       wb_queue_work(wb, work);
 }
 
 /**
- * bdi_start_writeback - start writeback
- * @bdi: the backing device to write from
- * @nr_pages: the number of pages to write
- * @reason: reason why some writeback work was initiated
- *
- * Description:
- *   This does WB_SYNC_NONE opportunistic writeback. The IO is only
- *   started when this function returns, we make no guarantees on
- *   completion. Caller need not hold sb s_umount semaphore.
- *
- */
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                       enum wb_reason reason)
-{
-       __bdi_start_writeback(bdi, nr_pages, true, reason);
-}
-
-/**
- * bdi_start_background_writeback - start background writeback
- * @bdi: the backing device to write from
+ * wb_start_background_writeback - start background writeback
+ * @wb: bdi_writeback to write from
  *
  * Description:
  *   This makes sure WB_SYNC_NONE background writeback happens. When
- *   this function returns, it is only guaranteed that for given BDI
+ *   this function returns, it is only guaranteed that for given wb
  *   some IO is happening if we are over background dirty threshold.
  *   Caller need not hold sb s_umount semaphore.
  */
-void bdi_start_background_writeback(struct backing_dev_info *bdi)
+void wb_start_background_writeback(struct bdi_writeback *wb)
 {
        /*
         * We just wake up the flusher thread. It will perform background
         * writeback as soon as there is no other work to do.
         */
-       trace_writeback_wake_background(bdi);
-       bdi_wakeup_thread(bdi);
+       trace_writeback_wake_background(wb->bdi);
+       wb_wakeup(wb);
 }
 
 /*
@@ -202,11 +962,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct bdi_writeback *wb;
 
-       spin_lock(&bdi->wb.list_lock);
-       list_del_init(&inode->i_wb_list);
-       spin_unlock(&bdi->wb.list_lock);
+       wb = inode_to_wb_and_lock_list(inode);
+       inode_wb_list_del_locked(inode, wb);
+       spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -220,7 +980,6 @@ void inode_wb_list_del(struct inode *inode)
  */
 static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-       assert_spin_locked(&wb->list_lock);
        if (!list_empty(&wb->b_dirty)) {
                struct inode *tail;
 
@@ -228,7 +987,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
                if (time_before(inode->dirtied_when, tail->dirtied_when))
                        inode->dirtied_when = jiffies;
        }
-       list_move(&inode->i_wb_list, &wb->b_dirty);
+       inode_wb_list_move_locked(inode, wb, &wb->b_dirty);
 }
 
 /*
@@ -236,8 +995,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
  */
 static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-       assert_spin_locked(&wb->list_lock);
-       list_move(&inode->i_wb_list, &wb->b_more_io);
+       inode_wb_list_move_locked(inode, wb, &wb->b_more_io);
 }
 
 static void inode_sync_complete(struct inode *inode)
@@ -346,6 +1104,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
        moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
        moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
                                     EXPIRE_DIRTY_ATIME, work);
+       if (moved)
+               wb_io_lists_populated(wb);
        trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -471,10 +1231,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
                redirty_tail(inode, wb);
        } else if (inode->i_state & I_DIRTY_TIME) {
                inode->dirtied_when = jiffies;
-               list_move(&inode->i_wb_list, &wb->b_dirty_time);
+               inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time);
        } else {
                /* The inode is clean. Remove from writeback lists. */
-               list_del_init(&inode->i_wb_list);
+               inode_wb_list_del_locked(inode, wb);
        }
 }
 
@@ -605,10 +1365,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
             !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
                goto out;
        inode->i_state |= I_SYNC;
-       spin_unlock(&inode->i_lock);
+       wbc_attach_and_unlock_inode(wbc, inode);
 
        ret = __writeback_single_inode(inode, wbc);
 
+       wbc_detach_inode(wbc);
        spin_lock(&wb->list_lock);
        spin_lock(&inode->i_lock);
        /*
@@ -616,7 +1377,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
         * touch it. See comment above for explanation.
         */
        if (!(inode->i_state & I_DIRTY_ALL))
-               list_del_init(&inode->i_wb_list);
+               inode_wb_list_del_locked(inode, wb);
        spin_unlock(&wb->list_lock);
        inode_sync_complete(inode);
 out:
@@ -624,7 +1385,7 @@ out:
        return ret;
 }
 
-static long writeback_chunk_size(struct backing_dev_info *bdi,
+static long writeback_chunk_size(struct bdi_writeback *wb,
                                 struct wb_writeback_work *work)
 {
        long pages;
@@ -645,8 +1406,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi,
        if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
                pages = LONG_MAX;
        else {
-               pages = min(bdi->avg_write_bandwidth / 2,
-                           global_dirty_limit / DIRTY_SCOPE);
+               pages = min(wb->avg_write_bandwidth / 2,
+                           global_wb_domain.dirty_limit / DIRTY_SCOPE);
                pages = min(pages, work->nr_pages);
                pages = round_down(pages + MIN_WRITEBACK_PAGES,
                                   MIN_WRITEBACK_PAGES);
@@ -741,9 +1502,9 @@ static long writeback_sb_inodes(struct super_block *sb,
                        continue;
                }
                inode->i_state |= I_SYNC;
-               spin_unlock(&inode->i_lock);
+               wbc_attach_and_unlock_inode(&wbc, inode);
 
-               write_chunk = writeback_chunk_size(wb->bdi, work);
+               write_chunk = writeback_chunk_size(wb, work);
                wbc.nr_to_write = write_chunk;
                wbc.pages_skipped = 0;
 
@@ -753,6 +1514,7 @@ static long writeback_sb_inodes(struct super_block *sb,
                 */
                __writeback_single_inode(inode, &wbc);
 
+               wbc_detach_inode(&wbc);
                work->nr_pages -= write_chunk - wbc.nr_to_write;
                wrote += write_chunk - wbc.nr_to_write;
                spin_lock(&wb->list_lock);
@@ -830,33 +1592,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
        return nr_pages - work.nr_pages;
 }
 
-static bool over_bground_thresh(struct backing_dev_info *bdi)
-{
-       unsigned long background_thresh, dirty_thresh;
-
-       global_dirty_limits(&background_thresh, &dirty_thresh);
-
-       if (global_page_state(NR_FILE_DIRTY) +
-           global_page_state(NR_UNSTABLE_NFS) > background_thresh)
-               return true;
-
-       if (bdi_stat(bdi, BDI_RECLAIMABLE) >
-                               bdi_dirty_limit(bdi, background_thresh))
-               return true;
-
-       return false;
-}
-
-/*
- * Called under wb->list_lock. If there are multiple wb per bdi,
- * only the flusher working on the first wb should do it.
- */
-static void wb_update_bandwidth(struct bdi_writeback *wb,
-                               unsigned long start_time)
-{
-       __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time);
-}
-
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -899,14 +1634,14 @@ static long wb_writeback(struct bdi_writeback *wb,
                 * after the other works are all done.
                 */
                if ((work->for_background || work->for_kupdate) &&
-                   !list_empty(&wb->bdi->work_list))
+                   !list_empty(&wb->work_list))
                        break;
 
                /*
                 * For background writeout, stop when we are below the
                 * background dirty threshold
                 */
-               if (work->for_background && !over_bground_thresh(wb->bdi))
+               if (work->for_background && !wb_over_bg_thresh(wb))
                        break;
 
                /*
@@ -970,18 +1705,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 /*
  * Return the next wb_writeback_work struct that hasn't been processed yet.
  */
-static struct wb_writeback_work *
-get_next_work_item(struct backing_dev_info *bdi)
+static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb)
 {
        struct wb_writeback_work *work = NULL;
 
-       spin_lock_bh(&bdi->wb_lock);
-       if (!list_empty(&bdi->work_list)) {
-               work = list_entry(bdi->work_list.next,
+       spin_lock_bh(&wb->work_lock);
+       if (!list_empty(&wb->work_list)) {
+               work = list_entry(wb->work_list.next,
                                  struct wb_writeback_work, list);
                list_del_init(&work->list);
        }
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_unlock_bh(&wb->work_lock);
        return work;
 }
 
@@ -998,7 +1732,7 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-       if (over_bground_thresh(wb->bdi)) {
+       if (wb_over_bg_thresh(wb)) {
 
                struct wb_writeback_work work = {
                        .nr_pages       = LONG_MAX,
@@ -1053,25 +1787,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb)
  */
 static long wb_do_writeback(struct bdi_writeback *wb)
 {
-       struct backing_dev_info *bdi = wb->bdi;
        struct wb_writeback_work *work;
        long wrote = 0;
 
-       set_bit(BDI_writeback_running, &wb->bdi->state);
-       while ((work = get_next_work_item(bdi)) != NULL) {
+       set_bit(WB_writeback_running, &wb->state);
+       while ((work = get_next_work_item(wb)) != NULL) {
+               struct wb_completion *done = work->done;
+               bool need_wake_up = false;
 
-               trace_writeback_exec(bdi, work);
+               trace_writeback_exec(wb->bdi, work);
 
                wrote += wb_writeback(wb, work);
 
-               /*
-                * Notify the caller of completion if this is a synchronous
-                * work item, otherwise just free it.
-                */
-               if (work->done)
-                       complete(work->done);
-               else
+               if (work->single_wait) {
+                       WARN_ON_ONCE(work->auto_free);
+                       /* paired w/ rmb in wb_wait_for_single_work() */
+                       smp_wmb();
+                       work->single_done = 1;
+                       need_wake_up = true;
+               } else if (work->auto_free) {
                        kfree(work);
+               }
+
+               if (done && atomic_dec_and_test(&done->cnt))
+                       need_wake_up = true;
+
+               if (need_wake_up)
+                       wake_up_all(&wb->bdi->wb_waitq);
        }
 
        /*
@@ -1079,7 +1821,7 @@ static long wb_do_writeback(struct bdi_writeback *wb)
         */
        wrote += wb_check_old_data_flush(wb);
        wrote += wb_check_background_flush(wb);
-       clear_bit(BDI_writeback_running, &wb->bdi->state);
+       clear_bit(WB_writeback_running, &wb->state);
 
        return wrote;
 }
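
The hunk above turns wb_do_writeback() into the consumer side of a per-wb work protocol: items carrying ->auto_free are kfree'd by the worker, while completion-style items drop a count on their wb_completion and wake bdi->wb_waitq. Below is a minimal producer-side sketch, assuming it sits in fs/fs-writeback.c where struct wb_writeback_work, wb->work_lock and bdi_wq are visible; example_queue_async_work() is hypothetical and elides checks (such as WB_registered) that a real queueing path would want.

static void example_queue_async_work(struct bdi_writeback *wb, long nr_pages,
                                     enum wb_reason reason)
{
        struct wb_writeback_work *work;

        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work)
                return;

        work->sync_mode = WB_SYNC_NONE;
        work->nr_pages  = nr_pages;
        work->reason    = reason;
        work->auto_free = 1;            /* freed by wb_do_writeback() above */

        spin_lock_bh(&wb->work_lock);
        list_add_tail(&work->list, &wb->work_list);
        spin_unlock_bh(&wb->work_lock);

        /* kick the per-wb worker immediately */
        mod_delayed_work(bdi_wq, &wb->dwork, 0);
}
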
@@ -1088,43 +1830,42 @@ static long wb_do_writeback(struct bdi_writeback *wb)
  * Handle writeback of dirty data for the device backed by this bdi. Also
  * reschedules periodically and does kupdated style flushing.
  */
-void bdi_writeback_workfn(struct work_struct *work)
+void wb_workfn(struct work_struct *work)
 {
        struct bdi_writeback *wb = container_of(to_delayed_work(work),
                                                struct bdi_writeback, dwork);
-       struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
 
-       set_worker_desc("flush-%s", dev_name(bdi->dev));
+       set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
        current->flags |= PF_SWAPWRITE;
 
        if (likely(!current_is_workqueue_rescuer() ||
-                  !test_bit(BDI_registered, &bdi->state))) {
+                  !test_bit(WB_registered, &wb->state))) {
                /*
-                * The normal path.  Keep writing back @bdi until its
+                * The normal path.  Keep writing back @wb until its
                 * work_list is empty.  Note that this path is also taken
-                * if @bdi is shutting down even when we're running off the
+                * if @wb is shutting down even when we're running off the
                 * rescuer as work_list needs to be drained.
                 */
                do {
                        pages_written = wb_do_writeback(wb);
                        trace_writeback_pages_written(pages_written);
-               } while (!list_empty(&bdi->work_list));
+               } while (!list_empty(&wb->work_list));
        } else {
                /*
                 * bdi_wq can't get enough workers and we're running off
                 * the emergency worker.  Don't hog it.  Hopefully, 1024 is
                 * enough for efficient IO.
                 */
-               pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+               pages_written = writeback_inodes_wb(wb, 1024,
                                                    WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
        }
 
-       if (!list_empty(&bdi->work_list))
+       if (!list_empty(&wb->work_list))
                mod_delayed_work(bdi_wq, &wb->dwork, 0);
        else if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-               bdi_wakeup_thread_delayed(bdi);
+               wb_wakeup_delayed(wb);
 
        current->flags &= ~PF_SWAPWRITE;
 }
@@ -1142,9 +1883,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+               struct bdi_writeback *wb;
+               struct wb_iter iter;
+
                if (!bdi_has_dirty_io(bdi))
                        continue;
-               __bdi_start_writeback(bdi, nr_pages, false, reason);
+
+               bdi_for_each_wb(wb, bdi, &iter, 0)
+                       wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
+                                          false, reason);
        }
        rcu_read_unlock();
 }
@@ -1173,9 +1920,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w)
 
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
-               if (list_empty(&bdi->wb.b_dirty_time))
-                       continue;
-               bdi_wakeup_thread(bdi);
+               struct bdi_writeback *wb;
+               struct wb_iter iter;
+
+               bdi_for_each_wb(wb, bdi, &iter, 0)
+                       if (!list_empty(&wb->b_dirty_time))
+                               wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
@@ -1249,7 +1999,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
        struct super_block *sb = inode->i_sb;
-       struct backing_dev_info *bdi = NULL;
        int dirtytime;
 
        trace_writeback_mark_inode_dirty(inode, flags);
@@ -1289,6 +2038,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
        if ((inode->i_state & flags) != flags) {
                const int was_dirty = inode->i_state & I_DIRTY;
 
+               inode_attach_wb(inode, NULL);
+
                if (flags & I_DIRTY_INODE)
                        inode->i_state &= ~I_DIRTY_TIME;
                inode->i_state |= flags;
@@ -1317,38 +2068,39 @@ void __mark_inode_dirty(struct inode *inode, int flags)
                 * reposition it (that would break b_dirty time-ordering).
                 */
                if (!was_dirty) {
+                       struct bdi_writeback *wb;
+                       struct list_head *dirty_list;
                        bool wakeup_bdi = false;
-                       bdi = inode_to_bdi(inode);
 
-                       spin_unlock(&inode->i_lock);
-                       spin_lock(&bdi->wb.list_lock);
-                       if (bdi_cap_writeback_dirty(bdi)) {
-                               WARN(!test_bit(BDI_registered, &bdi->state),
-                                    "bdi-%s not registered\n", bdi->name);
+                       wb = locked_inode_to_wb_and_lock_list(inode);
 
-                               /*
-                                * If this is the first dirty inode for this
-                                * bdi, we have to wake-up the corresponding
-                                * bdi thread to make sure background
-                                * write-back happens later.
-                                */
-                               if (!wb_has_dirty_io(&bdi->wb))
-                                       wakeup_bdi = true;
-                       }
+                       WARN(bdi_cap_writeback_dirty(wb->bdi) &&
+                            !test_bit(WB_registered, &wb->state),
+                            "bdi-%s not registered\n", wb->bdi->name);
 
                        inode->dirtied_when = jiffies;
                        if (dirtytime)
                                inode->dirtied_time_when = jiffies;
+
                        if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
-                               list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+                               dirty_list = &wb->b_dirty;
                        else
-                               list_move(&inode->i_wb_list,
-                                         &bdi->wb.b_dirty_time);
-                       spin_unlock(&bdi->wb.list_lock);
+                               dirty_list = &wb->b_dirty_time;
+
+                       wakeup_bdi = inode_wb_list_move_locked(inode, wb,
+                                                              dirty_list);
+
+                       spin_unlock(&wb->list_lock);
                        trace_writeback_dirty_inode_enqueue(inode);
 
-                       if (wakeup_bdi)
-                               bdi_wakeup_thread_delayed(bdi);
+                       /*
+                        * If this is the first dirty inode for this bdi,
+                        * we have to wake-up the corresponding bdi thread
+                        * to make sure background write-back happens
+                        * later.
+                        */
+                       if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi)
+                               wb_wakeup_delayed(wb);
                        return;
                }
        }
@@ -1411,6 +2163,28 @@ static void wait_sb_inodes(struct super_block *sb)
        iput(old_inode);
 }
 
+static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+                                    enum wb_reason reason, bool skip_if_busy)
+{
+       DEFINE_WB_COMPLETION_ONSTACK(done);
+       struct wb_writeback_work work = {
+               .sb                     = sb,
+               .sync_mode              = WB_SYNC_NONE,
+               .tagged_writepages      = 1,
+               .done                   = &done,
+               .nr_pages               = nr,
+               .reason                 = reason,
+       };
+       struct backing_dev_info *bdi = sb->s_bdi;
+
+       if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
+               return;
+       WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+       bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+       wb_wait_for_completion(bdi, &done);
+}
+
 /**
  * writeback_inodes_sb_nr -    writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1425,21 +2199,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
                            unsigned long nr,
                            enum wb_reason reason)
 {
-       DECLARE_COMPLETION_ONSTACK(done);
-       struct wb_writeback_work work = {
-               .sb                     = sb,
-               .sync_mode              = WB_SYNC_NONE,
-               .tagged_writepages      = 1,
-               .done                   = &done,
-               .nr_pages               = nr,
-               .reason                 = reason,
-       };
-
-       if (sb->s_bdi == &noop_backing_dev_info)
-               return;
-       WARN_ON(!rwsem_is_locked(&sb->s_umount));
-       bdi_queue_work(sb->s_bdi, &work);
-       wait_for_completion(&done);
+       __writeback_inodes_sb_nr(sb, nr, reason, false);
 }
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
@@ -1467,19 +2227,15 @@ EXPORT_SYMBOL(writeback_inodes_sb);
  * Invoke writeback_inodes_sb_nr if no writeback is currently underway.
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb_nr(struct super_block *sb,
-                                 unsigned long nr,
-                                 enum wb_reason reason)
+bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr,
+                                  enum wb_reason reason)
 {
-       if (writeback_in_progress(sb->s_bdi))
-               return 1;
-
        if (!down_read_trylock(&sb->s_umount))
-               return 0;
+               return false;
 
-       writeback_inodes_sb_nr(sb, nr, reason);
+       __writeback_inodes_sb_nr(sb, nr, reason, true);
        up_read(&sb->s_umount);
-       return 1;
+       return true;
 }
 EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
 
@@ -1491,7 +2247,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr);
  * Implement by try_to_writeback_inodes_sb_nr()
  * Returns 1 if writeback was started, 0 if not.
  */
-int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
+bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason)
 {
        return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason);
 }
@@ -1506,7 +2262,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  */
 void sync_inodes_sb(struct super_block *sb)
 {
-       DECLARE_COMPLETION_ONSTACK(done);
+       DEFINE_WB_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_ALL,
@@ -1516,14 +2272,15 @@ void sync_inodes_sb(struct super_block *sb)
                .reason         = WB_REASON_SYNC,
                .for_sync       = 1,
        };
+       struct backing_dev_info *bdi = sb->s_bdi;
 
        /* Nothing to do? */
-       if (sb->s_bdi == &noop_backing_dev_info)
+       if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-       bdi_queue_work(sb->s_bdi, &work);
-       wait_for_completion(&done);
+       bdi_split_work_to_wbs(bdi, &work, false);
+       wb_wait_for_completion(bdi, &done);
 
        wait_sb_inodes(sb);
 }
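
Both writeback_inodes_sb_nr() and sync_inodes_sb() now follow the same shape: an on-stack wb_completion, a template work item fanned out to every wb of the bdi, and a wait on bdi->wb_waitq. A condensed sketch of that shape follows, assuming it lives next to __writeback_inodes_sb_nr() in fs/fs-writeback.c; example_flush_sb() is hypothetical.

static void example_flush_sb(struct super_block *sb)
{
        DEFINE_WB_COMPLETION_ONSTACK(done);
        struct wb_writeback_work work = {
                .sb             = sb,
                .sync_mode      = WB_SYNC_NONE,
                .nr_pages       = LONG_MAX,
                .reason         = WB_REASON_SYNC,
                .done           = &done,
        };
        struct backing_dev_info *bdi = sb->s_bdi;

        if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info)
                return;

        /* one copy of @work goes to each wb; each drops a count on @done */
        bdi_split_work_to_wbs(bdi, &work, false);
        wb_wait_for_completion(bdi, &done);
}
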
index 5ef05b5c4cff86e9353a0594f0faadc1ab8612f5..8c5e2fa68835a07216d250dc9536aafaa1063dc2 100644 (file)
@@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 
        list_del(&req->writepages_entry);
        for (i = 0; i < req->num_pages; i++) {
-               dec_bdi_stat(bdi, BDI_WRITEBACK);
+               dec_wb_stat(&bdi->wb, WB_WRITEBACK);
                dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
-               bdi_writeout_inc(bdi);
+               wb_writeout_inc(&bdi->wb);
        }
        wake_up(&fi->page_waitq);
 }
@@ -1634,7 +1634,7 @@ static int fuse_writepage_locked(struct page *page)
        req->end = fuse_writepage_end;
        req->inode = inode;
 
-       inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+       inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
        spin_lock(&fc->lock);
@@ -1749,9 +1749,9 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
                copy_highpage(old_req->pages[0], page);
                spin_unlock(&fc->lock);
 
-               dec_bdi_stat(bdi, BDI_WRITEBACK);
+               dec_wb_stat(&bdi->wb, WB_WRITEBACK);
                dec_zone_page_state(page, NR_WRITEBACK_TEMP);
-               bdi_writeout_inc(bdi);
+               wb_writeout_inc(&bdi->wb);
                fuse_writepage_free(fc, new_req);
                fuse_request_free(new_req);
                goto out;
@@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page,
        req->page_descs[req->num_pages].offset = 0;
        req->page_descs[req->num_pages].length = PAGE_SIZE;
 
-       inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+       inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK);
        inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
 
        err = 0;
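
The fuse conversion above is typical of the tree-wide change from per-bdi to per-wb statistics: BDI_WRITEBACK and friends become WB_WRITEBACK counters on a bdi_writeback. A hedged sketch of the accounting pattern; example_account_writeback() is hypothetical and, without cgroup writeback, simply targets the bdi's embedded root wb.

#include <linux/backing-dev.h>

static void example_account_writeback(struct inode *inode, bool starting)
{
        /* without cgroup writeback this resolves to the bdi's root wb */
        struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

        if (starting) {
                inc_wb_stat(wb, WB_WRITEBACK);
        } else {
                dec_wb_stat(wb, WB_WRITEBACK);
                wb_writeout_inc(wb);    /* count a completed writeout for this wb */
        }
}
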
index 859c6edbf81a07cc0bfa5f688928301ac4f676ca..2982445947e174a5bd0f7e6ebaa3292c1302c7ac 100644 (file)
@@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 
        if (wbc->sync_mode == WB_SYNC_ALL)
                gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
-       if (bdi->dirty_exceeded)
+       if (bdi->wb.dirty_exceeded)
                gfs2_ail1_flush(sdp, wbc);
        else
                filemap_fdatawrite(metamapping);
index eee7206c38d18e1a3d3b1697fd429763e3fe9f5b..55c03b9e90708e1230210c271779350ca3c72cc7 100644 (file)
@@ -14,6 +14,7 @@
 
 #include <linux/module.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/nls.h>
index 593af2fdcc2dafec8574caa59d970365aede5239..7302d96ae8bfd742fe9f360f70ccd5d1b5df085d 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/vfs.h>
index e8d62688ed9181e511e2a0e8c6a5f36840cdbe94..069721f0cc0e0b733bb659fb0d7836cd71499690 100644 (file)
@@ -224,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
        BUG_ON(inode_has_buffers(inode));
+       inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode->i_flctx);
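
The inode_detach_wb() call above pairs with the inode_attach_wb() added to __mark_inode_dirty() earlier in this series; together they bound the lifetime of the inode->i_wb association. A hedged sketch of that lifecycle; example_inode_wb_lifecycle() is hypothetical.

static void example_inode_wb_lifecycle(struct inode *inode, struct page *page)
{
        /*
         * First dirtying: associate the inode with a wb.  @page may be
         * NULL; when given, the memcg owning the dirtying page helps
         * pick the wb.
         */
        inode_attach_wb(inode, page);

        /* ... the inode cycles through wb->b_dirty / b_io / b_more_io ... */

        /* final teardown, as in __destroy_inode() above */
        inode_detach_wb(inode);
}
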
index 3e79220babac28383c5c68cdb3580378d3324bcc..ca0244b69de8bad8939ebb904e8d90156a81773e 100644 (file)
@@ -605,6 +605,8 @@ alloc_new:
                                bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH);
                if (bio == NULL)
                        goto confused;
+
+               wbc_init_bio(wbc, bio);
        }
 
        /*
@@ -612,6 +614,7 @@ alloc_new:
         * the confused fail path above (OOM) will be very confused when
         * it finds all bh marked clean (i.e. it will not write anything)
         */
+       wbc_account_io(wbc, page, PAGE_SIZE);
        length = first_unmapped << blkbits;
        if (bio_add_page(bio, page, length, 0) < length) {
                bio = mpage_bio_submit(WRITE, bio);
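
The two mpage.c hooks above are the filesystem-side glue for cgroup writeback: wbc_init_bio() associates a freshly built bio with the cgroup the writeback_control is working for, and wbc_account_io() feeds the foreign-inode detection with per-page ownership information. A hedged sketch of where the calls sit in a submission path; example_submit_writeback_page() is hypothetical and omits error handling.

static void example_submit_writeback_page(struct writeback_control *wbc,
                                          struct bio *bio, struct page *page)
{
        wbc_init_bio(wbc, bio);                 /* tag the bio with wbc's cgroup */
        wbc_account_io(wbc, page, PAGE_SIZE);   /* attribute bytes to the page's owner */

        bio_add_page(bio, page, PAGE_SIZE, 0);  /* return value check elided */
        submit_bio(WRITE, bio);
}
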
index a46bf6de9ce455a97ef18275efe4906ac10e12ed..b34f2e228601684a7bd9f31d7d8081a8e7623b53 100644 (file)
@@ -32,6 +32,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/backing-dev.h>
 
 #include <linux/sunrpc/metrics.h>
 
index 9e6475bc5ba22be7293c687d78dda1a626d3024b..7e3c4604bea8a6e6e92906b6c2096adb6df5ca3f 100644 (file)
@@ -607,7 +607,7 @@ void nfs_mark_page_unstable(struct page *page)
        struct inode *inode = page_file_mapping(page)->host;
 
        inc_zone_page_state(page, NR_UNSTABLE_NFS);
-       inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+       inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE);
         __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 }
 
index dfc19f1575a19d00bee1b0aeef6575e4416a9ef9..e6c262555e08a62aff65ef3baa04e9666e9f18c2 100644 (file)
@@ -853,7 +853,8 @@ static void
 nfs_clear_page_commit(struct page *page)
 {
        dec_zone_page_state(page, NR_UNSTABLE_NFS);
-       dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+       dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+                   WB_RECLAIMABLE);
 }
 
 /* Called holding inode (/cinfo) lock */
index fbfadb289e628ce32decb024bab92792e4ebc995..719f7f4c7a37bd8cfb292fed77756baed9bacb0b 100644 (file)
@@ -37,6 +37,7 @@
 #include <linux/falloc.h>
 #include <linux/quotaops.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 
 #include <cluster/masklog.h>
 
index 0111ad0466ed42146f652db0be879b7335ab28df..3e0af317fcc3eb5e9af88b8f03e00de2f0153747 100644 (file)
@@ -21,6 +21,7 @@
 #include "xattr.h"
 #include <linux/init.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/exportfs.h>
 #include <linux/quotaops.h>
index b3bc3e7ae79db3c1542f8e8cf3ac549aa0deed62..098508a93c7b302fe8e6ab65ec7cd753d2515634 100644 (file)
@@ -80,6 +80,7 @@
 #include <linux/stat.h>
 #include <linux/string.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/init.h>
 #include <linux/parser.h>
 #include <linux/buffer_head.h>
index 095f94c2d8b564a46d3c48636fb0532e6b10a658..e5099f26803285af3aa774a897635264911b1458 100644 (file)
@@ -1873,6 +1873,7 @@ xfs_vm_set_page_dirty(
        loff_t                  end_offset;
        loff_t                  offset;
        int                     newly_dirty;
+       struct mem_cgroup       *memcg;
 
        if (unlikely(!mapping))
                return !TestSetPageDirty(page);
@@ -1892,6 +1893,11 @@ xfs_vm_set_page_dirty(
                        offset += 1 << inode->i_blkbits;
                } while (bh != head);
        }
+       /*
+        * Use mem_cgroup_begin_page_stat() to keep PageDirty synchronized with
+        * per-memcg dirty page counters.
+        */
+       memcg = mem_cgroup_begin_page_stat(page);
        newly_dirty = !TestSetPageDirty(page);
        spin_unlock(&mapping->private_lock);
 
@@ -1902,13 +1908,15 @@ xfs_vm_set_page_dirty(
                spin_lock_irqsave(&mapping->tree_lock, flags);
                if (page->mapping) {    /* Race with truncate? */
                        WARN_ON_ONCE(!PageUptodate(page));
-                       account_page_dirtied(page, mapping);
+                       account_page_dirtied(page, mapping, memcg);
                        radix_tree_tag_set(&mapping->page_tree,
                                        page_index(page), PAGECACHE_TAG_DIRTY);
                }
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
-               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
+       mem_cgroup_end_page_stat(memcg);
+       if (newly_dirty)
+               __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        return newly_dirty;
 }
 
index 3b7591224f4a6698d32371a927e70cb2a391f4a9..7c62fca53e2fc36b5c61f37a829e6532b7e210d2 100644 (file)
@@ -41,6 +41,7 @@
 #include <linux/dcache.h>
 #include <linux/falloc.h>
 #include <linux/pagevec.h>
+#include <linux/backing-dev.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
new file mode 100644 (file)
index 0000000..a48d90e
--- /dev/null
@@ -0,0 +1,255 @@
+#ifndef __LINUX_BACKING_DEV_DEFS_H
+#define __LINUX_BACKING_DEV_DEFS_H
+
+#include <linux/list.h>
+#include <linux/radix-tree.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/percpu_counter.h>
+#include <linux/percpu-refcount.h>
+#include <linux/flex_proportions.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+struct page;
+struct device;
+struct dentry;
+
+/*
+ * Bits in bdi_writeback.state
+ */
+enum wb_state {
+       WB_registered,          /* bdi_register() was done */
+       WB_writeback_running,   /* Writeback is in progress */
+       WB_has_dirty_io,        /* Dirty inodes on ->b_{dirty|io|more_io} */
+};
+
+enum wb_congested_state {
+       WB_async_congested,     /* The async (write) queue is getting full */
+       WB_sync_congested,      /* The sync queue is getting full */
+};
+
+typedef int (congested_fn)(void *, int);
+
+enum wb_stat_item {
+       WB_RECLAIMABLE,
+       WB_WRITEBACK,
+       WB_DIRTIED,
+       WB_WRITTEN,
+       NR_WB_STAT_ITEMS
+};
+
+#define WB_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
+
+/*
+ * For cgroup writeback, multiple wb's may map to the same blkcg.  Those
+ * wb's can operate mostly independently but should share the congested
+ * state.  To facilitate such sharing, the congested state is tracked using
+ * the following struct which is created on demand, indexed by blkcg ID on
+ * its bdi, and refcounted.
+ */
+struct bdi_writeback_congested {
+       unsigned long state;            /* WB_[a]sync_congested flags */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct backing_dev_info *bdi;   /* the associated bdi */
+       atomic_t refcnt;                /* nr of attached wb's and blkg */
+       int blkcg_id;                   /* ID of the associated blkcg */
+       struct rb_node rb_node;         /* on bdi->cgwb_congestion_tree */
+#endif
+};
+
+/*
+ * Each wb (bdi_writeback) can perform writeback operations and is
+ * measured and throttled independently.  Without cgroup writeback, each
+ * bdi (backing_dev_info) is served by its embedded bdi->wb.
+ *
+ * On the default hierarchy, blkcg implicitly enables memcg.  This allows
+ * using memcg's page ownership for attributing writeback IOs, and every
+ * memcg - blkcg combination can be served by its own wb by assigning a
+ * dedicated wb to each memcg, which enables isolation across different
+ * cgroups and propagation of IO back pressure down from the IO layer up to
+ * the tasks which are generating the dirty pages to be written back.
+ *
+ * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
+ * refcounted with the number of inodes attached to it, and pins the memcg
+ * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
+ * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
+ * is tested for blkcg after lookup and removed from index on mismatch so
+ * that a new wb for the combination can be created.
+ */
+struct bdi_writeback {
+       struct backing_dev_info *bdi;   /* our parent bdi */
+
+       unsigned long state;            /* Always use atomic bitops on this */
+       unsigned long last_old_flush;   /* last old data flush */
+
+       struct list_head b_dirty;       /* dirty inodes */
+       struct list_head b_io;          /* parked for writeback */
+       struct list_head b_more_io;     /* parked for more writeback */
+       struct list_head b_dirty_time;  /* time stamps are dirty */
+       spinlock_t list_lock;           /* protects the b_* lists */
+
+       struct percpu_counter stat[NR_WB_STAT_ITEMS];
+
+       struct bdi_writeback_congested *congested;
+
+       unsigned long bw_time_stamp;    /* last time write bw is updated */
+       unsigned long dirtied_stamp;
+       unsigned long written_stamp;    /* pages written at bw_time_stamp */
+       unsigned long write_bandwidth;  /* the estimated write bandwidth */
+       unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */
+
+       /*
+        * The base dirty throttle rate, recalculated every 200ms.
+        * All the bdi tasks' dirty rates will be curbed under it.
+        * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
+        * in small steps and is much more smooth/stable than the latter.
+        */
+       unsigned long dirty_ratelimit;
+       unsigned long balanced_dirty_ratelimit;
+
+       struct fprop_local_percpu completions;
+       int dirty_exceeded;
+
+       spinlock_t work_lock;           /* protects work_list & dwork scheduling */
+       struct list_head work_list;
+       struct delayed_work dwork;      /* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct percpu_ref refcnt;       /* used only for !root wb's */
+       struct fprop_local_percpu memcg_completions;
+       struct cgroup_subsys_state *memcg_css; /* the associated memcg */
+       struct cgroup_subsys_state *blkcg_css; /* and blkcg */
+       struct list_head memcg_node;    /* anchored at memcg->cgwb_list */
+       struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
+
+       union {
+               struct work_struct release_work;
+               struct rcu_head rcu;
+       };
+#endif
+};
+
+struct backing_dev_info {
+       struct list_head bdi_list;
+       unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
+       unsigned int capabilities; /* Device capabilities */
+       congested_fn *congested_fn; /* Function pointer if device is md/dm */
+       void *congested_data;   /* Pointer to aux data for congested func */
+
+       char *name;
+
+       unsigned int min_ratio;
+       unsigned int max_ratio, max_prop_frac;
+
+       /*
+        * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
+        * any dirty wbs, which bdi_has_dirty_io() depends on.
+        */
+       atomic_long_t tot_write_bandwidth;
+
+       struct bdi_writeback wb;  /* the root writeback info for this bdi */
+       struct bdi_writeback_congested wb_congested; /* its congested state */
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
+       struct rb_root cgwb_congested_tree; /* their congested states */
+       atomic_t usage_cnt; /* counts both cgwbs and cgwb_congested's */
+#endif
+       wait_queue_head_t wb_waitq;
+
+       struct device *dev;
+
+       struct timer_list laptop_mode_wb_timer;
+
+#ifdef CONFIG_DEBUG_FS
+       struct dentry *debug_dir;
+       struct dentry *debug_stats;
+#endif
+};
+
+enum {
+       BLK_RW_ASYNC    = 0,
+       BLK_RW_SYNC     = 1,
+};
+
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync);
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync);
+
+static inline void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+       clear_wb_congested(bdi->wb.congested, sync);
+}
+
+static inline void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+{
+       set_wb_congested(bdi->wb.congested, sync);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/**
+ * wb_tryget - try to increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+       if (wb != &wb->bdi->wb)
+               return percpu_ref_tryget(&wb->refcnt);
+       return true;
+}
+
+/**
+ * wb_get - increment a wb's refcount
+ * @wb: bdi_writeback to get
+ */
+static inline void wb_get(struct bdi_writeback *wb)
+{
+       if (wb != &wb->bdi->wb)
+               percpu_ref_get(&wb->refcnt);
+}
+
+/**
+ * wb_put - decrement a wb's refcount
+ * @wb: bdi_writeback to put
+ */
+static inline void wb_put(struct bdi_writeback *wb)
+{
+       if (wb != &wb->bdi->wb)
+               percpu_ref_put(&wb->refcnt);
+}
+
+/**
+ * wb_dying - is a wb dying?
+ * @wb: bdi_writeback of interest
+ *
+ * Returns whether @wb is unlinked and being drained.
+ */
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+       return percpu_ref_is_dying(&wb->refcnt);
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool wb_tryget(struct bdi_writeback *wb)
+{
+       return true;
+}
+
+static inline void wb_get(struct bdi_writeback *wb)
+{
+}
+
+static inline void wb_put(struct bdi_writeback *wb)
+{
+}
+
+static inline bool wb_dying(struct bdi_writeback *wb)
+{
+       return false;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+#endif /* __LINUX_BACKING_DEV_DEFS_H */
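
The refcounting helpers at the end of the new header act only on cgroup wbs; the root bdi->wb is embedded in its bdi and never goes away, so they short-circuit for it. A hedged usage sketch; example_poke_wb() is hypothetical.

static void example_poke_wb(struct bdi_writeback *wb)
{
        /* tryget fails only for a dying cgroup wb; the root bdi->wb always succeeds */
        if (!wb_tryget(wb))
                return;

        if (!wb_dying(wb) && wb_has_dirty_io(wb))
                wb_wakeup_delayed(wb);  /* declared in backing-dev.h below */

        wb_put(wb);
}
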
index d87d8eced06407c59c6d231f9e707bdcc398ce52..0e6d4828a77a358edd3c77ef7d14eecc6f6001b3 100644 (file)
 #ifndef _LINUX_BACKING_DEV_H
 #define _LINUX_BACKING_DEV_H
 
-#include <linux/percpu_counter.h>
-#include <linux/log2.h>
-#include <linux/flex_proportions.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/sched.h>
-#include <linux/timer.h>
+#include <linux/blkdev.h>
 #include <linux/writeback.h>
-#include <linux/atomic.h>
-#include <linux/sysctl.h>
-#include <linux/workqueue.h>
-
-struct page;
-struct device;
-struct dentry;
-
-/*
- * Bits in backing_dev_info.state
- */
-enum bdi_state {
-       BDI_async_congested,    /* The async (write) queue is getting full */
-       BDI_sync_congested,     /* The sync queue is getting full */
-       BDI_registered,         /* bdi_register() was done */
-       BDI_writeback_running,  /* Writeback is in progress */
-};
-
-typedef int (congested_fn)(void *, int);
-
-enum bdi_stat_item {
-       BDI_RECLAIMABLE,
-       BDI_WRITEBACK,
-       BDI_DIRTIED,
-       BDI_WRITTEN,
-       NR_BDI_STAT_ITEMS
-};
-
-#define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
-
-struct bdi_writeback {
-       struct backing_dev_info *bdi;   /* our parent bdi */
-
-       unsigned long last_old_flush;   /* last old data flush */
-
-       struct delayed_work dwork;      /* work item used for writeback */
-       struct list_head b_dirty;       /* dirty inodes */
-       struct list_head b_io;          /* parked for writeback */
-       struct list_head b_more_io;     /* parked for more writeback */
-       struct list_head b_dirty_time;  /* time stamps are dirty */
-       spinlock_t list_lock;           /* protects the b_* lists */
-};
-
-struct backing_dev_info {
-       struct list_head bdi_list;
-       unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
-       unsigned long state;    /* Always use atomic bitops on this */
-       unsigned int capabilities; /* Device capabilities */
-       congested_fn *congested_fn; /* Function pointer if device is md/dm */
-       void *congested_data;   /* Pointer to aux data for congested func */
-
-       char *name;
-
-       struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-
-       unsigned long bw_time_stamp;    /* last time write bw is updated */
-       unsigned long dirtied_stamp;
-       unsigned long written_stamp;    /* pages written at bw_time_stamp */
-       unsigned long write_bandwidth;  /* the estimated write bandwidth */
-       unsigned long avg_write_bandwidth; /* further smoothed write bw */
-
-       /*
-        * The base dirty throttle rate, re-calculated on every 200ms.
-        * All the bdi tasks' dirty rate will be curbed under it.
-        * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-        * in small steps and is much more smooth/stable than the latter.
-        */
-       unsigned long dirty_ratelimit;
-       unsigned long balanced_dirty_ratelimit;
-
-       struct fprop_local_percpu completions;
-       int dirty_exceeded;
-
-       unsigned int min_ratio;
-       unsigned int max_ratio, max_prop_frac;
-
-       struct bdi_writeback wb;  /* default writeback info for this bdi */
-       spinlock_t wb_lock;       /* protects work_list & wb.dwork scheduling */
-
-       struct list_head work_list;
-
-       struct device *dev;
-
-       struct timer_list laptop_mode_wb_timer;
-
-#ifdef CONFIG_DEBUG_FS
-       struct dentry *debug_dir;
-       struct dentry *debug_stats;
-#endif
-};
-
-struct backing_dev_info *inode_to_bdi(struct inode *inode);
+#include <linux/blk-cgroup.h>
+#include <linux/backing-dev-defs.h>
 
 int __must_check bdi_init(struct backing_dev_info *bdi);
 void bdi_destroy(struct backing_dev_info *bdi);
@@ -117,97 +24,99 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
 int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
-void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                       enum wb_reason reason);
-void bdi_start_background_writeback(struct backing_dev_info *bdi);
-void bdi_writeback_workfn(struct work_struct *work);
-int bdi_has_dirty_io(struct backing_dev_info *bdi);
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+                       bool range_cyclic, enum wb_reason reason);
+void wb_start_background_writeback(struct bdi_writeback *wb);
+void wb_workfn(struct work_struct *work);
+void wb_wakeup_delayed(struct bdi_writeback *wb);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
 
 extern struct workqueue_struct *bdi_wq;
 
-static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-       return !list_empty(&wb->b_dirty) ||
-              !list_empty(&wb->b_io) ||
-              !list_empty(&wb->b_more_io);
+       return test_bit(WB_has_dirty_io, &wb->state);
+}
+
+static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+{
+       /*
+        * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+        * any dirty wbs.  See wb_update_write_bandwidth().
+        */
+       return atomic_long_read(&bdi->tot_write_bandwidth);
 }
 
-static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item, s64 amount)
+static inline void __add_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item, s64 amount)
 {
-       __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+       __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
-static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline void __inc_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
 {
-       __add_bdi_stat(bdi, item, 1);
+       __add_wb_stat(wb, item, 1);
 }
 
-static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
        unsigned long flags;
 
        local_irq_save(flags);
-       __inc_bdi_stat(bdi, item);
+       __inc_wb_stat(wb, item);
        local_irq_restore(flags);
 }
 
-static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline void __dec_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
 {
-       __add_bdi_stat(bdi, item, -1);
+       __add_wb_stat(wb, item, -1);
 }
 
-static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
        unsigned long flags;
 
        local_irq_save(flags);
-       __dec_bdi_stat(bdi, item);
+       __dec_wb_stat(wb, item);
        local_irq_restore(flags);
 }
 
-static inline s64 bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
 {
-       return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_read_positive(&wb->stat[item]);
 }
 
-static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+                               enum wb_stat_item item)
 {
-       return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_sum_positive(&wb->stat[item]);
 }
 
-static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
 {
        s64 sum;
        unsigned long flags;
 
        local_irq_save(flags);
-       sum = __bdi_stat_sum(bdi, item);
+       sum = __wb_stat_sum(wb, item);
        local_irq_restore(flags);
 
        return sum;
 }
 
-extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+extern void wb_writeout_inc(struct bdi_writeback *wb);
 
 /*
  * maximal error of a stat counter.
  */
-static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
 {
 #ifdef CONFIG_SMP
-       return nr_cpu_ids * BDI_STAT_BATCH;
+       return nr_cpu_ids * WB_STAT_BATCH;
 #else
        return 1;
 #endif
@@ -231,50 +140,57 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
  * BDI_CAP_NO_WRITEBACK:   Don't write pages back
  * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
  * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+ *
+ * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
  */
 #define BDI_CAP_NO_ACCT_DIRTY  0x00000001
 #define BDI_CAP_NO_WRITEBACK   0x00000002
 #define BDI_CAP_NO_ACCT_WB     0x00000004
 #define BDI_CAP_STABLE_WRITES  0x00000008
 #define BDI_CAP_STRICTLIMIT    0x00000010
+#define BDI_CAP_CGROUP_WRITEBACK 0x00000020
 
 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
        (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
 
 extern struct backing_dev_info noop_backing_dev_info;
 
-int writeback_in_progress(struct backing_dev_info *bdi);
-
-static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+/**
+ * writeback_in_progress - determine whether there is writeback in progress
+ * @wb: bdi_writeback of interest
+ *
+ * Determine whether writeback is currently in progress against a
+ * bdi_writeback.
+ */
+static inline bool writeback_in_progress(struct bdi_writeback *wb)
 {
-       if (bdi->congested_fn)
-               return bdi->congested_fn(bdi->congested_data, bdi_bits);
-       return (bdi->state & bdi_bits);
+       return test_bit(WB_writeback_running, &wb->state);
 }
 
-static inline int bdi_read_congested(struct backing_dev_info *bdi)
+static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
-       return bdi_congested(bdi, 1 << BDI_sync_congested);
-}
+       struct super_block *sb;
 
-static inline int bdi_write_congested(struct backing_dev_info *bdi)
-{
-       return bdi_congested(bdi, 1 << BDI_async_congested);
+       if (!inode)
+               return &noop_backing_dev_info;
+
+       sb = inode->i_sb;
+#ifdef CONFIG_BLOCK
+       if (sb_is_blkdev_sb(sb))
+               return blk_get_backing_dev_info(I_BDEV(inode));
+#endif
+       return sb->s_bdi;
 }
 
-static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
 {
-       return bdi_congested(bdi, (1 << BDI_sync_congested) |
-                                 (1 << BDI_async_congested));
-}
+       struct backing_dev_info *bdi = wb->bdi;
 
-enum {
-       BLK_RW_ASYNC    = 0,
-       BLK_RW_SYNC     = 1,
-};
+       if (bdi->congested_fn)
+               return bdi->congested_fn(bdi->congested_data, cong_bits);
+       return wb->congested->state & cong_bits;
+}
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
-void set_bdi_congested(struct backing_dev_info *bdi, int sync);
 long congestion_wait(int sync, long timeout);
 long wait_iff_congested(struct zone *zone, int sync, long timeout);
 int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@ -318,4 +234,333 @@ static inline int bdi_sched_wait(void *word)
        return 0;
 }
 
-#endif         /* _LINUX_BACKING_DEV_H */
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+void wb_congested_put(struct bdi_writeback_congested *congested);
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp);
+void wb_memcg_offline(struct mem_cgroup *memcg);
+void wb_blkcg_offline(struct blkcg *blkcg);
+int inode_congested(struct inode *inode, int cong_bits);
+
+/**
+ * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+ * @inode: inode of interest
+ *
+ * cgroup writeback requires support from both the bdi and filesystem.
+ * Test whether @inode has both.
+ */
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+       return bdi_cap_account_dirty(bdi) &&
+               (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+               (inode->i_sb->s_iflags & SB_I_CGROUPWB);
+}
+
+/**
+ * wb_find_current - find wb for %current on a bdi
+ * @bdi: bdi of interest
+ *
+ * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+ * Must be called under rcu_read_lock() which protects the returned wb.
+ * Returns NULL if not found.
+ */
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+       struct cgroup_subsys_state *memcg_css;
+       struct bdi_writeback *wb;
+
+       memcg_css = task_css(current, memory_cgrp_id);
+       if (!memcg_css->parent)
+               return &bdi->wb;
+
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+
+       /*
+        * %current's blkcg equals the effective blkcg of its memcg.  No
+        * need to use the relatively expensive cgroup_get_e_css().
+        */
+       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+               return wb;
+       return NULL;
+}
+
+/**
+ * wb_get_create_current - get or create wb for %current on a bdi
+ * @bdi: bdi of interest
+ * @gfp: allocation mask
+ *
+ * Equivalent to wb_get_create() on %current's memcg.  This function is
+ * called from a relatively hot path and optimizes the common cases using
+ * wb_find_current().
+ */
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+       struct bdi_writeback *wb;
+
+       rcu_read_lock();
+       wb = wb_find_current(bdi);
+       if (wb && unlikely(!wb_tryget(wb)))
+               wb = NULL;
+       rcu_read_unlock();
+
+       if (unlikely(!wb)) {
+               struct cgroup_subsys_state *memcg_css;
+
+               memcg_css = task_get_css(current, memory_cgrp_id);
+               wb = wb_get_create(bdi, memcg_css, gfp);
+               css_put(memcg_css);
+       }
+       return wb;
+}
+
+/**
+ * inode_to_wb_is_valid - test whether an inode has a wb associated
+ * @inode: inode of interest
+ *
+ * Returns %true if @inode has a wb associated.  May be called without any
+ * locking.
+ */
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+       return inode->i_wb;
+}
+
+/**
+ * inode_to_wb - determine the wb of an inode
+ * @inode: inode of interest
+ *
+ * Returns the wb @inode is currently associated with.  The caller must be
+ * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+ * associated wb's list_lock.
+ */
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+#ifdef CONFIG_LOCKDEP
+       WARN_ON_ONCE(debug_locks &&
+                    (!lockdep_is_held(&inode->i_lock) &&
+                     !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+                     !lockdep_is_held(&inode->i_wb->list_lock)));
+#endif
+       return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+ * @inode: target inode
+ * @lockedp: temp bool output param, to be passed to the end function
+ *
+ * The caller wants to access the wb associated with @inode but isn't
+ * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+ * function determines the wb associated with @inode and ensures that the
+ * association doesn't change until the transaction is finished with
+ * unlocked_inode_to_wb_end().
+ *
+ * The caller must call unlocked_inode_to_wb_end() with *@lockedp
+ * afterwards and can't sleep during the transaction.  IRQ may or may not be
+ * disabled on return.
+ */
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+       rcu_read_lock();
+
+       /*
+        * Paired with store_release in inode_switch_wb_work_fn() and
+        * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+        */
+       *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+
+       if (unlikely(*lockedp))
+               spin_lock_irq(&inode->i_mapping->tree_lock);
+
+       /*
+        * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+        * inode_to_wb()'s lockdep annotation would complain, so deref directly.
+        */
+       return inode->i_wb;
+}
+
+/**
+ * unlocked_inode_to_wb_end - end inode wb access transaction
+ * @inode: target inode
+ * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+ */
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+       if (unlikely(locked))
+               spin_unlock_irq(&inode->i_mapping->tree_lock);
+
+       rcu_read_unlock();
+}
+
+struct wb_iter {
+       int                     start_blkcg_id;
+       struct radix_tree_iter  tree_iter;
+       void                    **slot;
+};
+
+static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi)
+{
+       struct radix_tree_iter *titer = &iter->tree_iter;
+
+       WARN_ON_ONCE(!rcu_read_lock_held());
+
+       if (iter->start_blkcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+               iter->start_blkcg_id = -1;
+       } else {
+               iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+       }
+
+       if (!iter->slot)
+               iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+       if (iter->slot)
+               return *iter->slot;
+       return NULL;
+}
+
+static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi,
+                                                  int start_blkcg_id)
+{
+       iter->start_blkcg_id = start_blkcg_id;
+
+       if (start_blkcg_id)
+               return __wb_iter_next(iter, bdi);
+       else
+               return &bdi->wb;
+}
+
+/**
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * @wb_cur: cursor struct bdi_writeback pointer
+ * @bdi: bdi to walk wb's of
+ * @iter: pointer to struct wb_iter to be used as iteration buffer
+ * @start_blkcg_id: blkcg ID to start iteration from
+ *
+ * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+ * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * to be used as temp storage during iteration.  rcu_read_lock() must be
+ * held throughout iteration.
+ */
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+            (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline bool inode_cgwb_enabled(struct inode *inode)
+{
+       return false;
+}
+
+static inline struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+       return bdi->wb.congested;
+}
+
+static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+}
+
+static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+{
+       return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+{
+       return &bdi->wb;
+}
+
+static inline bool inode_to_wb_is_valid(struct inode *inode)
+{
+       return true;
+}
+
+static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+{
+       return &inode_to_bdi(inode)->wb;
+}
+
+static inline struct bdi_writeback *
+unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+{
+       return inode_to_wb(inode);
+}
+
+static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+{
+}
+
+static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+}
+
+static inline void wb_blkcg_offline(struct blkcg *blkcg)
+{
+}
+
+struct wb_iter {
+       int             next_id;
+};
+
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
+       for ((iter)->next_id = (start_blkcg_id);                        \
+            ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+
+static inline int inode_congested(struct inode *inode, int cong_bits)
+{
+       return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
+static inline int inode_read_congested(struct inode *inode)
+{
+       return inode_congested(inode, 1 << WB_sync_congested);
+}
+
+static inline int inode_write_congested(struct inode *inode)
+{
+       return inode_congested(inode, 1 << WB_async_congested);
+}
+
+static inline int inode_rw_congested(struct inode *inode)
+{
+       return inode_congested(inode, (1 << WB_sync_congested) |
+                                     (1 << WB_async_congested));
+}
+
+static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+{
+       return wb_congested(&bdi->wb, cong_bits);
+}
+
+static inline int bdi_read_congested(struct backing_dev_info *bdi)
+{
+       return bdi_congested(bdi, 1 << WB_sync_congested);
+}
+
+static inline int bdi_write_congested(struct backing_dev_info *bdi)
+{
+       return bdi_congested(bdi, 1 << WB_async_congested);
+}
+
+static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+{
+       return bdi_congested(bdi, (1 << WB_sync_congested) |
+                                 (1 << WB_async_congested));
+}
+
+#endif /* _LINUX_BACKING_DEV_H */
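
Two hedged usage sketches for the iteration and transaction helpers above; example_wake_dirty_wbs() and example_dec_reclaimable() are hypothetical. The first walks every wb of a bdi under rcu_read_lock(), as bdi_for_each_wb() requires; the second updates a per-wb counter without holding i_lock, tree_lock or list_lock by bracketing the access with the unlocked transaction.

static void example_wake_dirty_wbs(struct backing_dev_info *bdi)
{
        struct bdi_writeback *wb;
        struct wb_iter iter;

        rcu_read_lock();
        bdi_for_each_wb(wb, bdi, &iter, 0)
                if (wb_has_dirty_io(wb))
                        wb_wakeup_delayed(wb);
        rcu_read_unlock();
}

static void example_dec_reclaimable(struct inode *inode)
{
        struct bdi_writeback *wb;
        bool locked;

        wb = unlocked_inode_to_wb_begin(inode, &locked);
        dec_wb_stat(wb, WB_RECLAIMABLE);        /* irq-safe on its own */
        unlocked_inode_to_wb_end(inode, locked);
}
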
index f0291cf64cc5f6d26a2e60ee5e1cfc02431556d5..5e963a6d7c1427e90a8c2d10f357271b8f6ef33b 100644 (file)
@@ -482,9 +482,12 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
+int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css);
 int bio_associate_current(struct bio *bio);
 void bio_disassociate_task(struct bio *bio);
 #else  /* CONFIG_BLK_CGROUP */
+static inline int bio_associate_blkcg(struct bio *bio,
+                       struct cgroup_subsys_state *blkcg_css) { return 0; }
 static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif /* CONFIG_BLK_CGROUP */
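
bio_associate_blkcg() complements bio_associate_current(): instead of charging the IO to the submitting task, writeback can charge it to an explicitly chosen blkcg, for example the one a bdi_writeback writes back for. A hedged sketch; example_tag_writeback_bio() is hypothetical, and the wb->blkcg_css field only exists under CONFIG_CGROUP_WRITEBACK.

static void example_tag_writeback_bio(struct bio *bio, struct bdi_writeback *wb)
{
#ifdef CONFIG_CGROUP_WRITEBACK
        /* attribute the IO to the cgroup this wb writes back for */
        bio_associate_blkcg(bio, wb->blkcg_css);
#endif
}
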
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
new file mode 100644 (file)
index 0000000..58cfab8
--- /dev/null
@@ -0,0 +1,655 @@
+#ifndef _BLK_CGROUP_H
+#define _BLK_CGROUP_H
+/*
+ * Common Block IO controller cgroup interface
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *                   Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ *                   Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/cgroup.h>
+#include <linux/u64_stats_sync.h>
+#include <linux/seq_file.h>
+#include <linux/radix-tree.h>
+#include <linux/blkdev.h>
+#include <linux/atomic.h>
+
+/* Max limits for throttle policy */
+#define THROTL_IOPS_MAX                UINT_MAX
+
+#ifdef CONFIG_BLK_CGROUP
+
+enum blkg_rwstat_type {
+       BLKG_RWSTAT_READ,
+       BLKG_RWSTAT_WRITE,
+       BLKG_RWSTAT_SYNC,
+       BLKG_RWSTAT_ASYNC,
+
+       BLKG_RWSTAT_NR,
+       BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+struct blkcg_gq;
+
+struct blkcg {
+       struct cgroup_subsys_state      css;
+       spinlock_t                      lock;
+
+       struct radix_tree_root          blkg_tree;
+       struct blkcg_gq                 *blkg_hint;
+       struct hlist_head               blkg_list;
+
+       struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head                cgwb_list;
+#endif
+};
+
+struct blkg_stat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt;
+};
+
+struct blkg_rwstat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt[BLKG_RWSTAT_NR];
+};
+
+/*
+ * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
+ * request_queue (q).  This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each has its private
+ * data on each blkg, the size of which is determined by
+ * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+ * together with blkg and invokes pd_init/exit_fn() methods.
+ *
+ * Such private data must embed struct blkg_policy_data (pd) at the
+ * beginning and pd_size can't be smaller than pd.
+ */
+struct blkg_policy_data {
+       /* the blkg and policy id this per-policy data belongs to */
+       struct blkcg_gq                 *blkg;
+       int                             plid;
+
+       /* used during policy activation */
+       struct list_head                alloc_node;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent
+ * of any request_queue associated with it must specify its size
+ * with the cpd_size field of the blkcg_policy structure and
+ * embed a blkcg_policy_data in it. blkcg core allocates
+ * policy-specific per-blkcg structures lazily the first time
+ * they are actually needed, so it handles them together with
+ * blkgs. cpd_init() is invoked to let each policy handle
+ * per-blkcg data.
+ */
+struct blkcg_policy_data {
+       /* the policy id this per-policy data belongs to */
+       int                             plid;
+
+       /* used during policy activation */
+       struct list_head                alloc_node;
+};
+
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
+       /* Pointer to the associated request_queue */
+       struct request_queue            *q;
+       struct list_head                q_node;
+       struct hlist_node               blkcg_node;
+       struct blkcg                    *blkcg;
+
+       /*
+        * Each blkg gets congested separately and the congestion state is
+        * propagated to the matching bdi_writeback_congested.
+        */
+       struct bdi_writeback_congested  *wb_congested;
+
+       /* all non-root blkcg_gq's are guaranteed to have access to parent */
+       struct blkcg_gq                 *parent;
+
+       /* request allocation list for this blkcg-q pair */
+       struct request_list             rl;
+
+       /* reference count */
+       atomic_t                        refcnt;
+
+       /* is this blkg online? protected by both blkcg and q locks */
+       bool                            online;
+
+       struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
+
+       struct rcu_head                 rcu_head;
+};
+
+typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+
+struct blkcg_policy {
+       int                             plid;
+       /* policy specific private data size */
+       size_t                          pd_size;
+       /* policy specific per-blkcg data size */
+       size_t                          cpd_size;
+       /* cgroup files for the policy */
+       struct cftype                   *cftypes;
+
+       /* operations */
+       blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_init_pd_fn            *pd_init_fn;
+       blkcg_pol_online_pd_fn          *pd_online_fn;
+       blkcg_pol_offline_pd_fn         *pd_offline_fn;
+       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern struct cgroup_subsys_state * const blkcg_root_css;
+
+struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+                                   struct request_queue *q);
+int blkcg_init_queue(struct request_queue *q);
+void blkcg_drain_queue(struct request_queue *q);
+void blkcg_exit_queue(struct request_queue *q);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct request_queue *q,
+                         const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct request_queue *q,
+                            const struct blkcg_policy *pol);
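A minimal sketch (not part of this patch) of how a policy might plug into the declarations above: the per-blkg private data embeds struct blkg_policy_data as its first member, and the policy is registered once at init time, with plid filled in by blkcg_policy_register().  The example_* names are hypothetical.

struct example_pd {
	struct blkg_policy_data	pd;	/* must come first, per the comment above */
	unsigned int		weight;
	u64			nr_dispatched;
};

static struct blkcg_policy example_policy;

static void example_pd_init(struct blkcg_gq *blkg)
{
	struct example_pd *epd = container_of(blkg_to_pd(blkg, &example_policy),
					      struct example_pd, pd);

	epd->weight = 100;
	epd->nr_dispatched = 0;
}

static struct blkcg_policy example_policy = {
	.pd_size	= sizeof(struct example_pd),	/* >= sizeof(struct blkg_policy_data) */
	.pd_init_fn	= example_pd_init,
};

static int __init example_init(void)
{
	return blkcg_policy_register(&example_policy);
}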
+
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+                      u64 (*prfill)(struct seq_file *,
+                                    struct blkg_policy_data *, int),
+                      const struct blkcg_policy *pol, int data,
+                      bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                        const struct blkg_rwstat *rwstat);
+u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                      int off);
+
+u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+                                            int off);
+
+struct blkg_conf_ctx {
+       struct gendisk                  *disk;
+       struct blkcg_gq                 *blkg;
+       u64                             v;
+};
+
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+                  const char *input, struct blkg_conf_ctx *ctx);
+void blkg_conf_finish(struct blkg_conf_ctx *ctx);
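A sketch of the intended blkg_conf_prep()/blkg_conf_finish() pairing when parsing a "MAJ:MIN value" style configuration string, reusing the hypothetical example_policy/example_pd from the sketch above:

static int example_set_weight(struct blkcg *blkcg, const char *input)
{
	struct blkg_conf_ctx ctx;
	struct example_pd *epd;
	int ret;

	ret = blkg_conf_prep(blkcg, &example_policy, input, &ctx);
	if (ret)
		return ret;

	/* ctx.blkg is the blkg for the device named in @input, ctx.v the value */
	epd = container_of(blkg_to_pd(ctx.blkg, &example_policy),
			   struct example_pd, pd);
	epd->weight = ctx.v;

	blkg_conf_finish(&ctx);
	return 0;
}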
+
+
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+{
+       return css ? container_of(css, struct blkcg, css) : NULL;
+}
+
+static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+{
+       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+}
+
+static inline struct blkcg *bio_blkcg(struct bio *bio)
+{
+       if (bio && bio->bi_css)
+               return css_to_blkcg(bio->bi_css);
+       return task_blkcg(current);
+}
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+       return task_get_css(task, blkio_cgrp_id);
+}
+
+/**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg.  Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+       return css_to_blkcg(blkcg->css.parent);
+}
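A small, hypothetical illustration of walking up the hierarchy with blkcg_parent():

static bool example_blkcg_is_ancestor(struct blkcg *ancestor, struct blkcg *blkcg)
{
	while (blkcg) {
		if (blkcg == ancestor)
			return true;
		blkcg = blkcg_parent(blkcg);
	}
	return false;
}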
+
+/**
+ * blkg_to_pd - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol)
+{
+       return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+                                                    struct blkcg_policy *pol)
+{
+       return blkcg ? blkcg->pd[pol->plid] : NULL;
+}
+
+/**
+ * pd_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data.  Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+       return pd ? pd->blkg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+       char *p;
+
+       p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+       if (!p) {
+               strncpy(buf, "<unavailable>", buflen);
+               return -ENAMETOOLONG;
+       }
+
+       memmove(buf, p, buf + buflen - p);
+       return 0;
+}
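A hypothetical caller of blkg_path(), which returns 0 on success:

static void example_print_blkg(struct blkcg_gq *blkg)
{
	char path[128];

	if (!blkg_path(blkg, path, sizeof(path)))
		pr_info("blkg belongs to cgroup %s\n", path);
}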
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       atomic_inc(&blkg->refcnt);
+}
+
+void __blkg_release_rcu(struct rcu_head *rcu);
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       if (atomic_dec_and_test(&blkg->refcnt))
+               call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+}
+
+struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+                              bool update_hint);
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+ * read locked.  If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs.  The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)          \
+       css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead.  Synchronization rules are the same.  @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)         \
+       css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
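A sketch of the pre-order iterator in use; the function name is hypothetical and, as documented above, the walk includes @p_blkg itself:

static unsigned int example_count_descendant_blkgs(struct blkcg_gq *p_blkg)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;
	unsigned int nr = 0;

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css, p_blkg)
		nr++;
	rcu_read_unlock();

	return nr;
}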
+
+/**
+ * blk_get_rl - get request_list to use
+ * @q: request_queue of interest
+ * @bio: bio which will be attached to the allocated request (may be %NULL)
+ *
+ * The caller wants to allocate a request from @q to use for @bio.  Find
+ * the request_list to use and obtain a reference on it.  Should be called
+ * under queue_lock.  This function is guaranteed to return a non-%NULL
+ * request_list.
+ */
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio)
+{
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+
+       rcu_read_lock();
+
+       blkcg = bio_blkcg(bio);
+
+       /* bypass blkg lookup and use @q->root_rl directly for root */
+       if (blkcg == &blkcg_root)
+               goto root_rl;
+
+       /*
+        * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+        * or if either the blkcg or queue is going away.  Fall back to
+        * root_rl in such cases.
+        */
+       blkg = blkg_lookup_create(blkcg, q);
+       if (unlikely(IS_ERR(blkg)))
+               goto root_rl;
+
+       blkg_get(blkg);
+       rcu_read_unlock();
+       return &blkg->rl;
+root_rl:
+       rcu_read_unlock();
+       return &q->root_rl;
+}
+
+/**
+ * blk_put_rl - put request_list
+ * @rl: request_list to put
+ *
+ * Put the reference acquired by blk_get_rl().  Should be called under
+ * queue_lock.
+ */
+static inline void blk_put_rl(struct request_list *rl)
+{
+       /* root_rl may not have blkg set */
+       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+               blkg_put(rl->blkg);
+}
+
+/**
+ * blk_rq_set_rl - associate a request with a request_list
+ * @rq: request of interest
+ * @rl: target request_list
+ *
+ * Associate @rq with @rl so that accounting and freeing can know the
+ * request_list @rq came from.
+ */
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+{
+       rq->rl = rl;
+}
+
+/**
+ * blk_rq_rl - return the request_list a request came from
+ * @rq: request of interest
+ *
+ * Return the request_list @rq is allocated from.
+ */
+static inline struct request_list *blk_rq_rl(struct request *rq)
+{
+       return rq->rl;
+}
+
+struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                        struct request_queue *q);
+/**
+ * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+ *
+ * Should be used under queue_lock.
+ */
+#define blk_queue_for_each_rl(rl, q)   \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
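A hedged sketch of iterating every request_list of a queue with the macro above, under queue_lock as required (wait[] and the BLK_RW_* indices are the existing request_list waitqueues):

static void example_wake_all_rls(struct request_queue *q)
{
	struct request_list *rl;

	lockdep_assert_held(q->queue_lock);

	blk_queue_for_each_rl(rl, q) {
		wake_up_all(&rl->wait[BLK_RW_SYNC]);
		wake_up_all(&rl->wait[BLK_RW_ASYNC]);
	}
}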
+
+static inline void blkg_stat_init(struct blkg_stat *stat)
+{
+       u64_stats_init(&stat->syncp);
+}
+
+/**
+ * blkg_stat_add - add a value to a blkg_stat
+ * @stat: target blkg_stat
+ * @val: value to add
+ *
+ * Add @val to @stat.  The caller is responsible for synchronizing calls to
+ * this function.
+ */
+static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+{
+       u64_stats_update_begin(&stat->syncp);
+       stat->cnt += val;
+       u64_stats_update_end(&stat->syncp);
+}
+
+/**
+ * blkg_stat_read - read the current value of a blkg_stat
+ * @stat: blkg_stat to read
+ *
+ * Read the current value of @stat.  This function can be called without
+ * synchronization and takes care of u64 atomicity.
+ */
+static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+{
+       unsigned int start;
+       uint64_t v;
+
+       do {
+               start = u64_stats_fetch_begin_irq(&stat->syncp);
+               v = stat->cnt;
+       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
+
+       return v;
+}
+
+/**
+ * blkg_stat_reset - reset a blkg_stat
+ * @stat: blkg_stat to reset
+ */
+static inline void blkg_stat_reset(struct blkg_stat *stat)
+{
+       stat->cnt = 0;
+}
+
+/**
+ * blkg_stat_merge - merge a blkg_stat into another
+ * @to: the destination blkg_stat
+ * @from: the source
+ *
+ * Add @from's count to @to.
+ */
+static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+{
+       blkg_stat_add(to, blkg_stat_read(from));
+}
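A small usage sketch for the blkg_stat helpers above (the counter name is hypothetical); updates are serialized by the caller while reads may race freely:

static void example_note_dispatch(struct blkg_stat *nr_dispatched)
{
	blkg_stat_add(nr_dispatched, 1);
	pr_debug("dispatched so far: %llu\n",
		 (unsigned long long)blkg_stat_read(nr_dispatched));
}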
+
+static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+{
+       u64_stats_init(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @rw: mask of REQ_{WRITE|SYNC}
+ * @val: value to add
+ *
+ * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+                                  int rw, uint64_t val)
+{
+       u64_stats_update_begin(&rwstat->syncp);
+
+       if (rw & REQ_WRITE)
+               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+       if (rw & REQ_SYNC)
+               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+
+       u64_stats_update_end(&rwstat->syncp);
+}
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Read and return the current snapshot of @rwstat.
+ * This function can be called without synchronization and takes care of
+ * u64 atomicity.
+ */
+static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+{
+       unsigned int start;
+       struct blkg_rwstat tmp;
+
+       do {
+               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
+               tmp = *rwstat;
+       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+
+       return tmp;
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction.  This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+       struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+
+       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+}
+
+/**
+ * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's counts to @to.
+ */
+static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+                                    struct blkg_rwstat *from)
+{
+       struct blkg_rwstat v = blkg_rwstat_read(from);
+       int i;
+
+       u64_stats_update_begin(&to->syncp);
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               to->cnt[i] += v.cnt[i];
+       u64_stats_update_end(&to->syncp);
+}
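A hedged sketch of accounting a bio into blkg_rwstat counters, assuming the 4.2-era bio fields bi_rw and bi_iter.bi_size; the counter names are hypothetical:

static void example_account_bio(struct blkg_rwstat *bytes,
				struct blkg_rwstat *ios, struct bio *bio)
{
	blkg_rwstat_add(bytes, bio->bi_rw, bio->bi_iter.bi_size);
	blkg_rwstat_add(ios, bio->bi_rw, 1);
}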
+
+#else  /* CONFIG_BLK_CGROUP */
+
+struct blkcg {
+};
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_gq {
+};
+
+struct blkcg_policy {
+};
+
+#define blkcg_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
+static inline struct cgroup_subsys_state *
+task_get_blkcg_css(struct task_struct *task)
+{
+       return NULL;
+}
+
+#ifdef CONFIG_BLOCK
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+static inline void blkcg_drain_queue(struct request_queue *q) { }
+static inline void blkcg_exit_queue(struct request_queue *q) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct request_queue *q,
+                                       const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct request_queue *q,
+                                          const struct blkcg_policy *pol) { }
+
+static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+
+static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio) { return &q->root_rl; }
+static inline void blk_put_rl(struct request_list *rl) { }
+static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+
+#define blk_queue_for_each_rl(rl, q)   \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif /* CONFIG_BLOCK */
+#endif /* CONFIG_BLK_CGROUP */
+#endif /* _BLK_CGROUP_H */
index 5ced29cef03f7b01819019e7c34cbbc1b2b549a5..7f2f54b4587f78e17536c9197fe8172b566f8802 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
-#include <linux/backing-dev.h>
+#include <linux/backing-dev-defs.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
@@ -787,25 +787,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
 
-/*
- * A queue has just exitted congestion.  Note this in the global counter of
- * congested queues, and wake up anyone who was waiting for requests to be
- * put back.
- */
-static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
-{
-       clear_bdi_congested(&q->backing_dev_info, sync);
-}
-
-/*
- * A queue has just entered congestion.  Flag that in the queue's VM-visible
- * state flags and increment the global gounter of congested queues.
- */
-static inline void blk_set_queue_congested(struct request_queue *q, int sync)
-{
-       set_bdi_congested(&q->backing_dev_info, sync);
-}
-
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
index b9cb94c3102a402539368cee3016d81b1cc173f9..e7da0aa65b2d7a8dcd349271b9ae80e04f98a483 100644 (file)
@@ -773,6 +773,31 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
        return task_css_check(task, subsys_id, false);
 }
 
+/**
+ * task_get_css - find and get the css for (task, subsys)
+ * @task: the target task
+ * @subsys_id: the target subsystem ID
+ *
+ * Find the css for the (@task, @subsys_id) combination, increment a
+ * reference on and return it.  This function is guaranteed to return a
+ * valid css.
+ */
+static inline struct cgroup_subsys_state *
+task_get_css(struct task_struct *task, int subsys_id)
+{
+       struct cgroup_subsys_state *css;
+
+       rcu_read_lock();
+       while (true) {
+               css = task_css(task, subsys_id);
+               if (likely(css_tryget_online(css)))
+                       break;
+               cpu_relax();
+       }
+       rcu_read_unlock();
+       return css;
+}
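A brief, hypothetical caller of task_get_css() showing the get/put pairing:

static void example_note_task_blkcg(struct task_struct *task)
{
	struct cgroup_subsys_state *css;

	css = task_get_css(task, blkio_cgrp_id);
	pr_debug("task %d runs under blkcg css %p\n", task_pid_nr(task), css);
	css_put(css);
}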
+
 /**
  * task_css_is_root - test whether a task belongs to the root css
  * @task: the target task
index 5db7b1379d174848116124b5f9c26fc212798d21..e351da4a934f415b4ba0d2cc84acaa52c8120614 100644 (file)
@@ -35,6 +35,7 @@
 #include <uapi/linux/fs.h>
 
 struct backing_dev_info;
+struct bdi_writeback;
 struct export_operations;
 struct hd_geometry;
 struct iovec;
@@ -634,6 +635,14 @@ struct inode {
 
        struct hlist_node       i_hash;
        struct list_head        i_wb_list;      /* backing dev IO list */
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct bdi_writeback    *i_wb;          /* the associated cgroup wb */
+
+       /* foreign inode detection, see wbc_detach_inode() */
+       int                     i_wb_frn_winner;
+       u16                     i_wb_frn_avg_time;
+       u16                     i_wb_frn_history;
+#endif
        struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        union {
@@ -1232,6 +1241,8 @@ struct mm_struct;
 #define UMOUNT_NOFOLLOW        0x00000008      /* Don't follow symlink on umount */
 #define UMOUNT_UNUSED  0x80000000      /* Flag guaranteed to be unused */
 
+/* sb->s_iflags */
+#define SB_I_CGROUPWB  0x00000001      /* cgroup-aware writeback enabled */
 
 /* Possible states of 'frozen' field */
 enum {
@@ -1270,6 +1281,7 @@ struct super_block {
        const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
+       unsigned long           s_iflags;       /* internal SB_I_* flags */
        unsigned long           s_magic;
        struct dentry           *s_root;
        struct rw_semaphore     s_umount;
@@ -1806,6 +1818,11 @@ struct super_operations {
  *
  * I_DIO_WAKEUP                Never set.  Only used as a key for wait_on_bit().
  *
+ * I_WB_SWITCH         Cgroup bdi_writeback switching in progress.  Used to
+ *                     synchronize competing switching instances and to tell
+ *                     wb stat updates to grab mapping->tree_lock.  See
+ *                     inode_switch_wb_work_fn() for details.
+ *
  * Q: What is the difference between I_WILL_FREE and I_FREEING?
  */
 #define I_DIRTY_SYNC           (1 << 0)
@@ -1825,6 +1842,7 @@ struct super_operations {
 #define I_DIRTY_TIME           (1 << 11)
 #define __I_DIRTY_TIME_EXPIRED 12
 #define I_DIRTY_TIME_EXPIRED   (1 << __I_DIRTY_TIME_EXPIRED)
+#define I_WB_SWITCH            (1 << 13)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
 #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@ -2241,7 +2259,13 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int sb_is_blkdev_sb(struct super_block *sb);
+
+extern struct super_block *blockdev_superblock;
+
+static inline bool sb_is_blkdev_sb(struct super_block *sb)
+{
+       return sb == blockdev_superblock;
+}
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
index 6c8918114804fda89d00ed3e6b1482539f2dd4ee..73b02b0a8f609ac757de6ee59b23bcf8b0e87396 100644 (file)
@@ -41,6 +41,7 @@ enum mem_cgroup_stat_index {
        MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
+       MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
        MEM_CGROUP_STAT_WRITEBACK,      /* # of pages under writeback */
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
@@ -67,6 +68,8 @@ enum mem_cgroup_events_index {
 };
 
 #ifdef CONFIG_MEMCG
+extern struct cgroup_subsys_state *mem_cgroup_root_css;
+
 void mem_cgroup_events(struct mem_cgroup *memcg,
                       enum mem_cgroup_events_index idx,
                       unsigned int nr);
@@ -112,6 +115,7 @@ static inline bool mm_match_cgroup(struct mm_struct *mm,
 }
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
@@ -195,6 +199,8 @@ void mem_cgroup_split_huge_fixup(struct page *head);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
+#define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+
 static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                     enum mem_cgroup_events_index idx,
                                     unsigned int nr)
@@ -382,6 +388,29 @@ enum {
        OVER_LIMIT,
 };
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback);
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+       return NULL;
+}
+
+static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+                                      unsigned long *pavail,
+                                      unsigned long *pdirty,
+                                      unsigned long *pwriteback)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
 struct sock;
 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
 void sock_update_memcg(struct sock *sk);
index 24ad583596d1219b4ec1111e5aea3045230ee650..99959a34f4f15e6d66b8a6681b256134634164ee 100644 (file)
@@ -27,6 +27,7 @@ struct anon_vma_chain;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct bdi_writeback;
 
 #ifndef CONFIG_NEED_MULTIPLE_NODES     /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1211,10 +1212,13 @@ int __set_page_dirty_nobuffers(struct page *page);
 int __set_page_dirty_no_writeback(struct page *page);
 int redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);
-void account_page_dirtied(struct page *page, struct address_space *mapping);
-void account_page_cleaned(struct page *page, struct address_space *mapping);
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg);
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb);
 int set_page_dirty(struct page *page);
 int set_page_dirty_lock(struct page *page);
+void cancel_dirty_page(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
 int get_cmdline(struct task_struct *task, char *buffer, int buflen);
index 4b3736f7065c496601011b9474368238f9af923a..fb0814ca65c7328b0bb2bcf9be958dbff2a05c04 100644 (file)
@@ -651,7 +651,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
                                pgoff_t index, gfp_t gfp_mask);
 extern void delete_from_page_cache(struct page *page);
-extern void __delete_from_page_cache(struct page *page, void *shadow);
+extern void __delete_from_page_cache(struct page *page, void *shadow,
+                                    struct mem_cgroup *memcg);
 int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask);
 
 /*
index b2dd371ec0ca0aa6f0e1dc71d1f79c69c6dbdb2b..b333c945e57117aa3d80f6ec2df47f6513c82260 100644 (file)
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/workqueue.h>
 #include <linux/fs.h>
+#include <linux/flex_proportions.h>
+#include <linux/backing-dev-defs.h>
 
 DECLARE_PER_CPU(int, dirty_throttle_leaks);
 
@@ -84,8 +86,85 @@ struct writeback_control {
        unsigned for_reclaim:1;         /* Invoked from the page allocator */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned for_sync:1;            /* sync(2) WB_SYNC_ALL writeback */
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct bdi_writeback *wb;       /* wb this writeback is issued under */
+       struct inode *inode;            /* inode being written out */
+
+       /* foreign inode detection, see wbc_detach_inode() */
+       int wb_id;                      /* current wb id */
+       int wb_lcand_id;                /* last foreign candidate wb id */
+       int wb_tcand_id;                /* this foreign candidate wb id */
+       size_t wb_bytes;                /* bytes written by current wb */
+       size_t wb_lcand_bytes;          /* bytes written by last candidate */
+       size_t wb_tcand_bytes;          /* bytes written by this candidate */
+#endif
 };
 
+/*
+ * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
+ * and are measured against each other in.  There always is one global
+ * domain, global_wb_domain, that every wb in the system is a member of.
+ * This allows measuring the relative bandwidth of each wb to distribute
+ * dirtyable memory accordingly.
+ */
+struct wb_domain {
+       spinlock_t lock;
+
+       /*
+        * Scale the writeback cache size proportional to the relative
+        * writeout speed.
+        *
+        * We do this by keeping a floating proportion between BDIs, based
+        * on page writeback completions [end_page_writeback()]. Those
+        * devices that write out pages fastest will get the larger share,
+        * while the slower will get a smaller share.
+        *
+        * We use page writeout completions because we are interested in
+        * getting rid of dirty pages. Having them written out is the
+        * primary goal.
+        *
+        * We introduce a concept of time, a period over which we measure
+        * these events, because demand can/will vary over time. The length
+        * of this period itself is measured in page writeback completions.
+        */
+       struct fprop_global completions;
+       struct timer_list period_timer; /* timer for aging of completions */
+       unsigned long period_time;
+
+       /*
+        * The dirtyable memory and dirty threshold could be suddenly
+        * knocked down by a large amount (eg. on the startup of KVM in a
+        * swapless system). This may throw the system into deep dirty
+        * exceeded state and throttle heavy/light dirtiers alike. To
+        * retain good responsiveness, maintain global_dirty_limit for
+        * tracking slowly down to the knocked down dirty threshold.
+        *
+        * Both fields are protected by ->lock.
+        */
+       unsigned long dirty_limit_tstamp;
+       unsigned long dirty_limit;
+};
+
+/**
+ * wb_domain_size_changed - memory available to a wb_domain has changed
+ * @dom: wb_domain of interest
+ *
+ * This function should be called when the amount of memory available to
+ * @dom has changed.  It resets @dom's dirty limit parameters to prevent
+ * the past values which don't match the current configuration from skewing
+ * dirty throttling.  Without this, when the memory size of a wb_domain is
+ * greatly reduced, the dirty throttling logic may allow too many pages to
+ * be dirtied leading to consecutive unnecessary OOMs and may get stuck in
+ * that situation.
+ */
+static inline void wb_domain_size_changed(struct wb_domain *dom)
+{
+       spin_lock(&dom->lock);
+       dom->dirty_limit_tstamp = jiffies;
+       dom->dirty_limit = 0;
+       spin_unlock(&dom->lock);
+}
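A minimal sketch, with hypothetical names, of initializing a private wb_domain and notifying it when the memory backing it changes:

static struct wb_domain example_domain;

static int example_domain_setup(void)
{
	int err = wb_domain_init(&example_domain, GFP_KERNEL);

	if (err)
		return err;
	/* later, whenever the memory available to the domain changes: */
	wb_domain_size_changed(&example_domain);
	return 0;
}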
+
 /*
  * fs/fs-writeback.c
  */    
@@ -93,9 +172,9 @@ struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
                                                        enum wb_reason reason);
-int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
-int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
-                                 enum wb_reason reason);
+bool try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+bool try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
+                                  enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
@@ -107,6 +186,123 @@ static inline void wait_on_inode(struct inode *inode)
        wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/cgroup.h>
+#include <linux/bio.h>
+
+void __inode_attach_wb(struct inode *inode, struct page *page);
+void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+                                struct inode *inode)
+       __releases(&inode->i_lock);
+void wbc_detach_inode(struct writeback_control *wbc);
+void wbc_account_io(struct writeback_control *wbc, struct page *page,
+                   size_t bytes);
+
+/**
+ * inode_attach_wb - associate an inode with its wb
+ * @inode: inode of interest
+ * @page: page being dirtied (may be NULL)
+ *
+ * If @inode doesn't have its wb, associate it with the wb matching the
+ * memcg of @page or, if @page is NULL, %current.  May be called w/ or w/o
+ * @inode->i_lock.
+ */
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+       if (!inode->i_wb)
+               __inode_attach_wb(inode, page);
+}
+
+/**
+ * inode_detach_wb - disassociate an inode from its wb
+ * @inode: inode of interest
+ *
+ * @inode is being freed.  Detach from its wb.
+ */
+static inline void inode_detach_wb(struct inode *inode)
+{
+       if (inode->i_wb) {
+               wb_put(inode->i_wb);
+               inode->i_wb = NULL;
+       }
+}
+
+/**
+ * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite
+ * @wbc: writeback_control of interest
+ * @inode: target inode
+ *
+ * This function is to be used by __filemap_fdatawrite_range(), which is an
+ * alternative entry point into writeback code, and first ensures @inode is
+ * associated with a bdi_writeback and attaches it to @wbc.
+ */
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+                                              struct inode *inode)
+{
+       spin_lock(&inode->i_lock);
+       inode_attach_wb(inode, NULL);
+       wbc_attach_and_unlock_inode(wbc, inode);
+}
+
+/**
+ * wbc_init_bio - writeback-specific initialization of a bio
+ * @wbc: writeback_control for the writeback in progress
+ * @bio: bio to be initialized
+ *
+ * @bio is a part of the writeback in progress controlled by @wbc.  Perform
+ * writeback specific initialization.  This is used to apply the cgroup
+ * writeback context.
+ */
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+       /*
+        * pageout() path doesn't attach @wbc to the inode being written
+        * out.  This is intentional as we don't want the function to block
+        * behind a slow cgroup.  Ultimately, we want pageout() to kick off
+        * regular writeback instead of writing things out itself.
+        */
+       if (wbc->wb)
+               bio_associate_blkcg(bio, wbc->wb->blkcg_css);
+}
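A hedged sketch of how a filesystem writeback path is expected to combine these helpers, assuming @bio has already been built to cover @page:

static int example_write_one_page(struct inode *inode, struct page *page,
				  struct bio *bio, struct writeback_control *wbc)
{
	wbc_attach_fdatawrite_inode(wbc, inode);	/* bind inode and wbc to a wb */

	wbc_init_bio(wbc, bio);			/* apply cgroup writeback context */
	wbc_account_io(wbc, page, PAGE_SIZE);	/* feed foreign inode detection */
	submit_bio(WRITE, bio);

	wbc_detach_inode(wbc);
	return 0;
}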
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static inline void inode_attach_wb(struct inode *inode, struct page *page)
+{
+}
+
+static inline void inode_detach_wb(struct inode *inode)
+{
+}
+
+static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
+                                              struct inode *inode)
+       __releases(&inode->i_lock)
+{
+       spin_unlock(&inode->i_lock);
+}
+
+static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc,
+                                              struct inode *inode)
+{
+}
+
+static inline void wbc_detach_inode(struct writeback_control *wbc)
+{
+}
+
+static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio)
+{
+}
+
+static inline void wbc_account_io(struct writeback_control *wbc,
+                                 struct page *page, size_t bytes)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * mm/page-writeback.c
  */
@@ -120,8 +316,12 @@ static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 bool zone_dirty_ok(struct zone *zone);
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp);
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom);
+#endif
 
-extern unsigned long global_dirty_limit;
+extern struct wb_domain global_wb_domain;
 
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
@@ -155,19 +355,12 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int,
                                      void __user *, size_t *, loff_t *);
 
 void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
-                              unsigned long dirty);
-
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-                           unsigned long thresh,
-                           unsigned long bg_thresh,
-                           unsigned long dirty,
-                           unsigned long bdi_thresh,
-                           unsigned long bdi_dirty,
-                           unsigned long start_time);
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh);
 
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time);
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited(struct address_space *mapping);
+bool wb_over_bg_thresh(struct bdi_writeback *wb);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                void *data);
index c178d13d6f4c0cb51d441c59e7b4975a1913ed3e..a7aa607a4c55e51ec8ba8a6593828604e31a7aac 100644 (file)
@@ -360,7 +360,7 @@ TRACE_EVENT(global_dirty_state,
                __entry->nr_written     = global_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh   = dirty_thresh;
-               __entry->dirty_limit = global_dirty_limit;
+               __entry->dirty_limit    = global_wb_domain.dirty_limit;
        ),
 
        TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@ -399,13 +399,13 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 
        TP_fast_assign(
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->avg_write_bandwidth);
+               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
+               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
                __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
-                                         KBps(bdi->balanced_dirty_ratelimit);
+                                       KBps(bdi->wb.balanced_dirty_ratelimit);
        ),
 
        TP_printk("bdi %s: "
@@ -462,8 +462,9 @@ TRACE_EVENT(balance_dirty_pages,
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
 
-               __entry->limit          = global_dirty_limit;
-               __entry->setpoint       = (global_dirty_limit + freerun) / 2;
+               __entry->limit          = global_wb_domain.dirty_limit;
+               __entry->setpoint       = (global_wb_domain.dirty_limit +
+                                               freerun) / 2;
                __entry->dirty          = dirty;
                __entry->bdi_setpoint   = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
index b999fa381bf9fe1f37757af5e0a454cc6adb2da9..7260b27ebbabeb4537bc747cc604858e070baf00 100644 (file)
@@ -1127,6 +1127,11 @@ config DEBUG_BLK_CGROUP
        Enable some debugging help. Currently it exports additional stat
        files in a cgroup which can be useful for debugging.
 
+config CGROUP_WRITEBACK
+       bool
+       depends on MEMCG && BLK_CGROUP
+       default y
+
 endif # CGROUPS
 
 config CHECKPOINT_RESTORE
index 000e7b3b9896f2a9479687befd2442c43193614e..7756da31b02bcbb2a7f7036a4bbbdd093883ad6c 100644 (file)
@@ -18,6 +18,7 @@ struct backing_dev_info noop_backing_dev_info = {
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
+EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 
 static struct class *bdi_class;
 
@@ -48,7 +49,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
-       unsigned long bdi_thresh;
+       unsigned long wb_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
        struct inode *inode;
 
@@ -66,7 +67,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
        spin_unlock(&wb->list_lock);
 
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+       wb_thresh = wb_calc_thresh(wb, dirty_thresh);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
@@ -84,19 +85,19 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                  K(bdi_thresh),
+                  (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+                  (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+                  K(wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
-                  (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
-                  (unsigned long) K(bdi->write_bandwidth),
+                  (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+                  (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+                  (unsigned long) K(wb->write_bandwidth),
                   nr_dirty,
                   nr_io,
                   nr_more_io,
                   nr_dirty_time,
-                  !list_empty(&bdi->bdi_list), bdi->state);
+                  !list_empty(&bdi->bdi_list), bdi->wb.state);
 #undef K
 
        return 0;
@@ -255,13 +256,8 @@ static int __init default_bdi_init(void)
 }
 subsys_initcall(default_bdi_init);
 
-int bdi_has_dirty_io(struct backing_dev_info *bdi)
-{
-       return wb_has_dirty_io(&bdi->wb);
-}
-
 /*
- * This function is used when the first inode for this bdi is marked dirty. It
+ * This function is used when the first inode for this wb is marked dirty. It
  * wakes up the corresponding bdi thread which should then take care of the
  * periodic background write-out of dirty inodes. Since the write-out would
  * start only 'dirty_writeback_interval' centisecs from now anyway, we just
@@ -274,162 +270,550 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
  * We have to be careful not to postpone flush work if it is scheduled for
  * earlier. Thus we use queue_delayed_work().
  */
-void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+void wb_wakeup_delayed(struct bdi_writeback *wb)
 {
        unsigned long timeout;
 
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       spin_lock_bh(&bdi->wb_lock);
-       if (test_bit(BDI_registered, &bdi->state))
-               queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+       spin_unlock_bh(&wb->work_lock);
 }
 
 /*
- * Remove bdi from bdi_list, and ensure that it is no longer visible
+ * Initial write bandwidth: 100 MB/s
  */
-static void bdi_remove_from_list(struct backing_dev_info *bdi)
-{
-       spin_lock_bh(&bdi_lock);
-       list_del_rcu(&bdi->bdi_list);
-       spin_unlock_bh(&bdi_lock);
-
-       synchronize_rcu_expedited();
-}
+#define INIT_BW                (100 << (20 - PAGE_SHIFT))
 
-int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-               const char *fmt, ...)
+static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+                  gfp_t gfp)
 {
-       va_list args;
-       struct device *dev;
+       int i, err;
 
-       if (bdi->dev)   /* The driver needs to use separate queues per device */
-               return 0;
+       memset(wb, 0, sizeof(*wb));
 
-       va_start(args, fmt);
-       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-       va_end(args);
-       if (IS_ERR(dev))
-               return PTR_ERR(dev);
+       wb->bdi = bdi;
+       wb->last_old_flush = jiffies;
+       INIT_LIST_HEAD(&wb->b_dirty);
+       INIT_LIST_HEAD(&wb->b_io);
+       INIT_LIST_HEAD(&wb->b_more_io);
+       INIT_LIST_HEAD(&wb->b_dirty_time);
+       spin_lock_init(&wb->list_lock);
 
-       bdi->dev = dev;
+       wb->bw_time_stamp = jiffies;
+       wb->balanced_dirty_ratelimit = INIT_BW;
+       wb->dirty_ratelimit = INIT_BW;
+       wb->write_bandwidth = INIT_BW;
+       wb->avg_write_bandwidth = INIT_BW;
 
-       bdi_debug_register(bdi, dev_name(dev));
-       set_bit(BDI_registered, &bdi->state);
+       spin_lock_init(&wb->work_lock);
+       INIT_LIST_HEAD(&wb->work_list);
+       INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
 
-       spin_lock_bh(&bdi_lock);
-       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-       spin_unlock_bh(&bdi_lock);
+       err = fprop_local_init_percpu(&wb->completions, gfp);
+       if (err)
+               return err;
 
-       trace_writeback_bdi_register(bdi);
-       return 0;
-}
-EXPORT_SYMBOL(bdi_register);
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+               err = percpu_counter_init(&wb->stat[i], 0, gfp);
+               if (err) {
+                       while (--i)
+                               percpu_counter_destroy(&wb->stat[i]);
+                       fprop_local_destroy_percpu(&wb->completions);
+                       return err;
+               }
+       }
 
-int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
-{
-       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+       return 0;
 }
-EXPORT_SYMBOL(bdi_register_dev);
 
 /*
  * Remove bdi from the global list and shutdown any threads we have running
  */
-static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+static void wb_shutdown(struct bdi_writeback *wb)
 {
        /* Make sure nobody queues further work */
-       spin_lock_bh(&bdi->wb_lock);
-       if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
-               spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (!test_and_clear_bit(WB_registered, &wb->state)) {
+               spin_unlock_bh(&wb->work_lock);
                return;
        }
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_unlock_bh(&wb->work_lock);
 
        /*
-        * Make sure nobody finds us on the bdi_list anymore
+        * Drain work list and shutdown the delayed_work.  !WB_registered
+        * tells wb_workfn() that @wb is dying and its work_list needs to
+        * be drained no matter what.
         */
-       bdi_remove_from_list(bdi);
+       mod_delayed_work(bdi_wq, &wb->dwork, 0);
+       flush_delayed_work(&wb->dwork);
+       WARN_ON(!list_empty(&wb->work_list));
+}
+
+static void wb_exit(struct bdi_writeback *wb)
+{
+       int i;
+
+       WARN_ON(delayed_work_pending(&wb->dwork));
+
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+               percpu_counter_destroy(&wb->stat[i]);
+
+       fprop_local_destroy_percpu(&wb->completions);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#include <linux/memcontrol.h>
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+ * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
+ * protected.  cgwb_release_wait is used to wait for the completion of cgwb
+ * releases from the bdi destruction path.
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+
+/**
+ * wb_congested_get_create - get or create a wb_congested
+ * @bdi: associated bdi
+ * @blkcg_id: ID of the associated blkcg
+ * @gfp: allocation mask
+ *
+ * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
+ * The returned wb_congested has its reference count incremented.  Returns
+ * NULL on failure.
+ */
+struct bdi_writeback_congested *
+wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+{
+       struct bdi_writeback_congested *new_congested = NULL, *congested;
+       struct rb_node **node, *parent;
+       unsigned long flags;
+
+       if (blkcg_id == 1)
+               return &bdi->wb_congested;
+retry:
+       spin_lock_irqsave(&cgwb_lock, flags);
+
+       node = &bdi->cgwb_congested_tree.rb_node;
+       parent = NULL;
+
+       while (*node != NULL) {
+               parent = *node;
+               congested = container_of(parent, struct bdi_writeback_congested,
+                                        rb_node);
+               if (congested->blkcg_id < blkcg_id)
+                       node = &parent->rb_left;
+               else if (congested->blkcg_id > blkcg_id)
+                       node = &parent->rb_right;
+               else
+                       goto found;
+       }
+
+       if (new_congested) {
+               /* !found and storage for new one already allocated, insert */
+               congested = new_congested;
+               new_congested = NULL;
+               rb_link_node(&congested->rb_node, parent, node);
+               rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+               atomic_inc(&bdi->usage_cnt);
+               goto found;
+       }
+
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+
+       /* allocate storage for new one and retry */
+       new_congested = kzalloc(sizeof(*new_congested), gfp);
+       if (!new_congested)
+               return NULL;
+
+       atomic_set(&new_congested->refcnt, 0);
+       new_congested->bdi = bdi;
+       new_congested->blkcg_id = blkcg_id;
+       goto retry;
+
+found:
+       atomic_inc(&congested->refcnt);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(new_congested);
+       return congested;
+}
+
+/**
+ * wb_congested_put - put a wb_congested
+ * @congested: wb_congested to put
+ *
+ * Put @congested and destroy it if the refcnt reaches zero.
+ */
+void wb_congested_put(struct bdi_writeback_congested *congested)
+{
+       struct backing_dev_info *bdi = congested->bdi;
+       unsigned long flags;
+
+       if (congested->blkcg_id == 1)
+               return;
+
+       local_irq_save(flags);
+       if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+               local_irq_restore(flags);
+               return;
+       }
+
+       rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(congested);
+
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release_workfn(struct work_struct *work)
+{
+       struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+                                               release_work);
+       struct backing_dev_info *bdi = wb->bdi;
+
+       wb_shutdown(wb);
+
+       css_put(wb->memcg_css);
+       css_put(wb->blkcg_css);
+       wb_congested_put(wb->congested);
+
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+       percpu_ref_exit(&wb->refcnt);
+       wb_exit(wb);
+       kfree_rcu(wb, rcu);
+
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+}
+
+static void cgwb_release(struct percpu_ref *refcnt)
+{
+       struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+                                               refcnt);
+       schedule_work(&wb->release_work);
+}
+
+static void cgwb_kill(struct bdi_writeback *wb)
+{
+       lockdep_assert_held(&cgwb_lock);
+
+       WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+       list_del(&wb->memcg_node);
+       list_del(&wb->blkcg_node);
+       percpu_ref_kill(&wb->refcnt);
+}
+
+static int cgwb_create(struct backing_dev_info *bdi,
+                      struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+{
+       struct mem_cgroup *memcg;
+       struct cgroup_subsys_state *blkcg_css;
+       struct blkcg *blkcg;
+       struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+       struct bdi_writeback *wb;
+       unsigned long flags;
+       int ret = 0;
+
+       memcg = mem_cgroup_from_css(memcg_css);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg = css_to_blkcg(blkcg_css);
+       memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       blkcg_cgwb_list = &blkcg->cgwb_list;
+
+       /* look up again under lock and discard on blkcg mismatch */
+       spin_lock_irqsave(&cgwb_lock, flags);
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+       if (wb && wb->blkcg_css != blkcg_css) {
+               cgwb_kill(wb);
+               wb = NULL;
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (wb)
+               goto out_put;
+
+       /* need to create a new one */
+       wb = kmalloc(sizeof(*wb), gfp);
+       if (!wb)
+               return -ENOMEM;
+
+       ret = wb_init(wb, bdi, gfp);
+       if (ret)
+               goto err_free;
+
+       ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+       if (ret)
+               goto err_wb_exit;
+
+       ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+       if (ret)
+               goto err_ref_exit;
+
+       wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
+       if (!wb->congested) {
+               ret = -ENOMEM;
+               goto err_fprop_exit;
+       }
+
+       wb->memcg_css = memcg_css;
+       wb->blkcg_css = blkcg_css;
+       INIT_WORK(&wb->release_work, cgwb_release_workfn);
+       set_bit(WB_registered, &wb->state);
 
        /*
-        * Drain work list and shutdown the delayed_work.  At this point,
-        * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
-        * is dying and its work_list needs to be drained no matter what.
+        * The root wb determines the registered state of the whole bdi and
+        * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+        * whether they're still online.  Don't link @wb if any is dead.
+        * See wb_memcg_offline() and wb_blkcg_offline().
         */
-       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-       flush_delayed_work(&bdi->wb.dwork);
+       ret = -ENODEV;
+       spin_lock_irqsave(&cgwb_lock, flags);
+       if (test_bit(WB_registered, &bdi->wb.state) &&
+           blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+               /* we might have raced another instance of this function */
+               ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+               if (!ret) {
+                       atomic_inc(&bdi->usage_cnt);
+                       list_add(&wb->memcg_node, memcg_cgwb_list);
+                       list_add(&wb->blkcg_node, blkcg_cgwb_list);
+                       css_get(memcg_css);
+                       css_get(blkcg_css);
+               }
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (ret) {
+               if (ret == -EEXIST)
+                       ret = 0;
+               goto err_put_congested;
+       }
+       goto out_put;
+
+err_put_congested:
+       wb_congested_put(wb->congested);
+err_fprop_exit:
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+err_ref_exit:
+       percpu_ref_exit(&wb->refcnt);
+err_wb_exit:
+       wb_exit(wb);
+err_free:
+       kfree(wb);
+out_put:
+       css_put(blkcg_css);
+       return ret;
 }
 
-static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
+ * create one.  The returned wb has its refcount incremented.
+ *
+ * This function uses css_get() on @memcg_css and thus expects its refcnt
+ * to be positive on invocation.  IOW, rcu_read_lock() protection on
+ * @memcg_css isn't enough.  try_get it before calling this function.
+ *
+ * A wb is keyed by its associated memcg.  As blkcg implicitly enables
+ * memcg on the default hierarchy, memcg association is guaranteed to be
+ * more specific (equal to or a descendant of the associated blkcg) and thus
+ * identify both the memcg and blkcg associations.
+ *
+ * Because the blkcg associated with a memcg may change as blkcg is enabled
+ * and disabled closer to root in the hierarchy, each wb keeps track of
+ * both the memcg and blkcg associated with it and verifies the blkcg on
+ * each lookup.  On mismatch, the existing wb is discarded and a new one is
+ * created.
+ */
+struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp)
 {
-       memset(wb, 0, sizeof(*wb));
+       struct bdi_writeback *wb;
+
+       might_sleep_if(gfp & __GFP_WAIT);
+
+       if (!memcg_css->parent)
+               return &bdi->wb;
+
+       do {
+               rcu_read_lock();
+               wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+               if (wb) {
+                       struct cgroup_subsys_state *blkcg_css;
+
+                       /* see whether the blkcg association has changed */
+                       blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+                                                    &blkio_cgrp_subsys);
+                       if (unlikely(wb->blkcg_css != blkcg_css ||
+                                    !wb_tryget(wb)))
+                               wb = NULL;
+                       css_put(blkcg_css);
+               }
+               rcu_read_unlock();
+       } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+
+       return wb;
+}
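
Sketch (not part of the patch): a hypothetical helper showing how a caller that already holds a reference on @memcg_css might use wb_get_create(), including the fallback when creation fails.  The helper name is made up, and locking plus reference handling of the returned wb are glossed over.

	/* hypothetical illustration only */
	static struct bdi_writeback *example_get_wb(struct backing_dev_info *bdi,
						    struct cgroup_subsys_state *memcg_css)
	{
		struct bdi_writeback *wb;

		/* @memcg_css must already be pinned; rcu protection alone is not enough */
		wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC);

		/*
		 * wb_get_create() returns NULL if it cannot create the wb, e.g.
		 * under memory pressure or while the bdi/cgroups are dying;
		 * fall back to the always-present root wb embedded in the bdi.
		 */
		return wb ? wb : &bdi->wb;
	}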
 
-       wb->bdi = bdi;
-       wb->last_old_flush = jiffies;
-       INIT_LIST_HEAD(&wb->b_dirty);
-       INIT_LIST_HEAD(&wb->b_io);
-       INIT_LIST_HEAD(&wb->b_more_io);
-       INIT_LIST_HEAD(&wb->b_dirty_time);
-       spin_lock_init(&wb->list_lock);
-       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+static void cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+       bdi->wb.memcg_css = mem_cgroup_root_css;
+       bdi->wb.blkcg_css = blkcg_root_css;
+       bdi->wb_congested.blkcg_id = 1;
+       INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+       bdi->cgwb_congested_tree = RB_ROOT;
+       atomic_set(&bdi->usage_cnt, 1);
 }
 
-/*
- * Initial write bandwidth: 100 MB/s
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+{
+       struct radix_tree_iter iter;
+       void **slot;
+
+       WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+       spin_lock_irq(&cgwb_lock);
+       radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+               cgwb_kill(*slot);
+       spin_unlock_irq(&cgwb_lock);
+
+       /*
+        * All cgwb's and their congested states must be shutdown and
+        * released before returning.  Drain the usage counter to wait for
+        * all cgwb's and cgwb_congested's ever created on @bdi.
+        */
+       atomic_dec(&bdi->usage_cnt);
+       wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+}
+
+/**
+ * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+ * @memcg: memcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @memcg.
  */
-#define INIT_BW                (100 << (20 - PAGE_SHIFT))
+void wb_memcg_offline(struct mem_cgroup *memcg)
+{
+       LIST_HEAD(to_destroy);
+       struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       struct bdi_writeback *wb, *next;
+
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+               cgwb_kill(wb);
+       memcg_cgwb_list->next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+ * @blkcg: blkcg being offlined
+ *
+ * Also prevents creation of any new wb's associated with @blkcg.
+ */
+void wb_blkcg_offline(struct blkcg *blkcg)
+{
+       LIST_HEAD(to_destroy);
+       struct bdi_writeback *wb, *next;
+
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+               cgwb_kill(wb);
+       blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
 
 int bdi_init(struct backing_dev_info *bdi)
 {
-       int i, err;
+       int err;
 
        bdi->dev = NULL;
 
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
-       spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
-       INIT_LIST_HEAD(&bdi->work_list);
+       init_waitqueue_head(&bdi->wb_waitq);
 
-       bdi_wb_init(&bdi->wb, bdi);
+       err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+       if (err)
+               return err;
 
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-               err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-               if (err)
-                       goto err;
-       }
+       bdi->wb_congested.state = 0;
+       bdi->wb.congested = &bdi->wb_congested;
 
-       bdi->dirty_exceeded = 0;
+       cgwb_bdi_init(bdi);
+       return 0;
+}
+EXPORT_SYMBOL(bdi_init);
 
-       bdi->bw_time_stamp = jiffies;
-       bdi->written_stamp = 0;
+int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+               const char *fmt, ...)
+{
+       va_list args;
+       struct device *dev;
 
-       bdi->balanced_dirty_ratelimit = INIT_BW;
-       bdi->dirty_ratelimit = INIT_BW;
-       bdi->write_bandwidth = INIT_BW;
-       bdi->avg_write_bandwidth = INIT_BW;
+       if (bdi->dev)   /* The driver needs to use separate queues per device */
+               return 0;
 
-       err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+       va_start(args, fmt);
+       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+       va_end(args);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
 
-       if (err) {
-err:
-               while (i--)
-                       percpu_counter_destroy(&bdi->bdi_stat[i]);
-       }
+       bdi->dev = dev;
 
-       return err;
+       bdi_debug_register(bdi, dev_name(dev));
+       set_bit(WB_registered, &bdi->wb.state);
+
+       spin_lock_bh(&bdi_lock);
+       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+       spin_unlock_bh(&bdi_lock);
+
+       trace_writeback_bdi_register(bdi);
+       return 0;
 }
-EXPORT_SYMBOL(bdi_init);
+EXPORT_SYMBOL(bdi_register);
 
-void bdi_destroy(struct backing_dev_info *bdi)
+int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
 {
-       int i;
+       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+}
+EXPORT_SYMBOL(bdi_register_dev);
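
For orientation, a minimal driver-side sequence for the interfaces above (hypothetical and abbreviated; real users typically go through bdi_register_dev() or the block layer):

	static struct backing_dev_info example_bdi;	/* made-up example object */

	static int example_setup(void)
	{
		int err;

		err = bdi_init(&example_bdi);
		if (err)
			return err;

		err = bdi_register(&example_bdi, NULL, "example%d", 0);
		if (err)
			bdi_destroy(&example_bdi);
		return err;
	}

	static void example_teardown(void)
	{
		/* unregisters the device and drains all (cgroup) writeback state */
		bdi_destroy(&example_bdi);
	}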
+
+/*
+ * Remove bdi from bdi_list, and ensure that it is no longer visible
+ */
+static void bdi_remove_from_list(struct backing_dev_info *bdi)
+{
+       spin_lock_bh(&bdi_lock);
+       list_del_rcu(&bdi->bdi_list);
+       spin_unlock_bh(&bdi_lock);
 
-       bdi_wb_shutdown(bdi);
-       bdi_set_min_ratio(bdi, 0);
+       synchronize_rcu_expedited();
+}
 
-       WARN_ON(!list_empty(&bdi->work_list));
-       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+void bdi_destroy(struct backing_dev_info *bdi)
+{
+       /* make sure nobody finds us on the bdi_list anymore */
+       bdi_remove_from_list(bdi);
+       wb_shutdown(&bdi->wb);
+       cgwb_bdi_destroy(bdi);
 
        if (bdi->dev) {
                bdi_debug_unregister(bdi);
@@ -437,9 +821,7 @@ void bdi_destroy(struct backing_dev_info *bdi)
                bdi->dev = NULL;
        }
 
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
-               percpu_counter_destroy(&bdi->bdi_stat[i]);
-       fprop_local_destroy_percpu(&bdi->completions);
+       wb_exit(&bdi->wb);
 }
 EXPORT_SYMBOL(bdi_destroy);
 
@@ -472,31 +854,31 @@ static wait_queue_head_t congestion_wqh[2] = {
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
-static atomic_t nr_bdi_congested[2];
+static atomic_t nr_wb_congested[2];
 
-void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
-       enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];
+       enum wb_state bit;
 
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (test_and_clear_bit(bit, &bdi->state))
-               atomic_dec(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (test_and_clear_bit(bit, &congested->state))
+               atomic_dec(&nr_wb_congested[sync]);
        smp_mb__after_atomic();
        if (waitqueue_active(wqh))
                wake_up(wqh);
 }
-EXPORT_SYMBOL(clear_bdi_congested);
+EXPORT_SYMBOL(clear_wb_congested);
 
-void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
 {
-       enum bdi_state bit;
+       enum wb_state bit;
 
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (!test_and_set_bit(bit, &bdi->state))
-               atomic_inc(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (!test_and_set_bit(bit, &congested->state))
+               atomic_inc(&nr_wb_congested[sync]);
 }
-EXPORT_SYMBOL(set_bdi_congested);
+EXPORT_SYMBOL(set_wb_congested);
 
 /**
  * congestion_wait - wait for a backing_dev to become uncongested
@@ -555,7 +937,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
-       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
            !test_bit(ZONE_CONGESTED, &zone->flags)) {
                cond_resched();
 
index 4a3907cf79f89ac934331006b05636da20d8f319..b8a5bc66b0c09b915237ab600cdeceaa81b8d153 100644 (file)
@@ -115,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
        case POSIX_FADV_NOREUSE:
                break;
        case POSIX_FADV_DONTNEED:
-               if (!bdi_write_congested(bdi))
+               if (!inode_write_congested(mapping->host))
                        __filemap_fdatawrite_range(mapping, offset, endbyte,
                                                   WB_SYNC_NONE);
 
index 8d17ceea8dbeb1f641687407f2a27ebbff480533..11f10efd637c2d67e071c482951e2bb38a105d6b 100644 (file)
  *    ->tree_lock              (page_remove_rmap->set_page_dirty)
  *    bdi.wb->list_lock                (page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock          (page_remove_rmap->set_page_dirty)
+ *    ->memcg->move_lock       (page_remove_rmap->mem_cgroup_begin_page_stat)
  *    bdi.wb->list_lock                (zap_pte_range->set_page_dirty)
  *    ->inode->i_lock          (zap_pte_range->set_page_dirty)
  *    ->private_lock           (zap_pte_range->__set_page_dirty_buffers)
@@ -174,9 +175,11 @@ static void page_cache_tree_delete(struct address_space *mapping,
 /*
  * Delete a page from the page cache and free it. Caller has to make
  * sure the page is locked and that nobody else uses it - or that usage
- * is safe.  The caller must hold the mapping's tree_lock.
+ * is safe.  The caller must hold the mapping's tree_lock and
+ * mem_cgroup_begin_page_stat().
  */
-void __delete_from_page_cache(struct page *page, void *shadow)
+void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
 {
        struct address_space *mapping = page->mapping;
 
@@ -212,7 +215,8 @@ void __delete_from_page_cache(struct page *page, void *shadow)
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
 }
 
 /**
@@ -226,14 +230,20 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 void delete_from_page_cache(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
+
        void (*freepage)(struct page *);
 
        BUG_ON(!PageLocked(page));
 
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
 
        if (freepage)
                freepage(page);
@@ -283,7 +293,9 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
 
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
 }
 
@@ -472,6 +484,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
 
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
@@ -480,8 +494,9 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                new->mapping = mapping;
                new->index = offset;
 
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
@@ -493,7 +508,8 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
                        __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
index d551475517bfd8867dca18ea23657216a0d2364a..64bb8a22110c23b7989256b02c7d4851b89aab3f 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/file.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/swap.h>
 #include <linux/swapops.h>
 
index e65f7b0131d3598cb5ba0ce3497d47b43d676dea..acb93c554f6e8456dc9312734162317d1adea54d 100644 (file)
@@ -77,6 +77,7 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
+struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
 
 /* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
@@ -90,6 +91,7 @@ static const char * const mem_cgroup_stat_names[] = {
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
 };
@@ -322,11 +324,6 @@ struct mem_cgroup {
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;
 
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
@@ -346,6 +343,11 @@ struct mem_cgroup {
        atomic_t        numainfo_updating;
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+#endif
+
        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
@@ -596,6 +598,39 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
        return &memcg->css;
 }
 
+/**
+ * mem_cgroup_css_from_page - css of the memcg associated with a page
+ * @page: page of interest
+ *
+ * If memcg is bound to the default hierarchy, css of the memcg associated
+ * with @page is returned.  The returned css remains associated with @page
+ * until it is released.
+ *
+ * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+ * is returned.
+ *
+ * XXX: The above description of behavior on the default hierarchy isn't
+ * strictly true yet as replace_page_cache_page() can modify the
+ * association before @page is released even on the default hierarchy;
+ * however, the current and planned usages don't mix the two functions
+ * and replace_page_cache_page() will soon be updated to make the invariant
+ * actually true.
+ */
+struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+{
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+
+       memcg = page->mem_cgroup;
+
+       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+               memcg = root_mem_cgroup;
+
+       rcu_read_unlock();
+       return &memcg->css;
+}
+
 static struct mem_cgroup_per_zone *
 mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
@@ -795,15 +830,8 @@ static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
        long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -813,15 +841,8 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
        unsigned long val = 0;
        int cpu;
 
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
-#endif
-       put_online_cpus();
        return val;
 }
 
@@ -2020,6 +2041,7 @@ again:
 
        return memcg;
 }
+EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
 
 /**
  * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@ -2038,6 +2060,7 @@ void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
 
        rcu_read_unlock();
 }
+EXPORT_SYMBOL(mem_cgroup_end_page_stat);
 
 /**
  * mem_cgroup_update_page_stat - update page state statistics
@@ -2178,37 +2201,12 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-       int i;
-
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
 {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
 
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
@@ -2216,9 +2214,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
 
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
-
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@ -4004,6 +3999,98 @@ static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 }
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+{
+       return &memcg->cgwb_list;
+}
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+       wb_domain_exit(&memcg->cgwb_domain);
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+       wb_domain_size_changed(&memcg->cgwb_domain);
+}
+
+struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+
+       if (!memcg->css.parent)
+               return NULL;
+
+       return &memcg->cgwb_domain;
+}
+
+/**
+ * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+ * @wb: bdi_writeback in question
+ * @pavail: out parameter for number of available pages
+ * @pdirty: out parameter for number of dirty pages
+ * @pwriteback: out parameter for number of pages under writeback
+ *
+ * Determine the numbers of available, dirty, and writeback pages in @wb's
+ * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+ * more involved.
+ *
+ * A memcg's headroom is "min(max, high) - used".  The available memory is
+ * calculated as the lowest headroom of itself and the ancestors plus the
+ * number of pages already being used for file pages.  Note that this
+ * doesn't consider the actual amount of available memory in the system.
+ * The caller should further cap *@pavail accordingly.
+ */
+void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+       unsigned long head_room = PAGE_COUNTER_MAX;
+       unsigned long file_pages;
+
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+
+       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                   (1 << LRU_ACTIVE_FILE));
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+
+               head_room = min(head_room, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+
+       *pavail = file_pages + head_room;
+}
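
Worked illustration (numbers invented): if the memcg itself has ceiling min(limit, high) = 1 GiB with 768 MiB charged (256 MiB of headroom) and its sole ancestor below the root has a 4 GiB ceiling with roughly 3.8 GiB charged (about 200 MiB of headroom), the loop above settles on a head_room of about 200 MiB.  With 300 MiB of file pages already on the memcg's LRU lists, *pavail comes out to roughly 500 MiB worth of pages.  As the comment notes, this ignores how much memory the system actually has free; mdtc_cap_avail() later in this series clamps it against globally clean memory.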
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+{
+       return 0;
+}
+
+static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+{
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
 /*
  * DO NOT USE IN NEW FILES.
  *
@@ -4388,9 +4475,15 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
+
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
 
+out_free_stat:
+       free_percpu(memcg->stat);
 out_free:
        kfree(memcg);
        return NULL;
@@ -4417,6 +4510,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
                free_mem_cgroup_per_zone_info(memcg, node);
 
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
 }
 
@@ -4449,6 +4543,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -4467,7 +4562,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
 #endif
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+#endif
        return &memcg->css;
 
 free_out:
@@ -4555,6 +4652,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        vmpressure_cleanup(&memcg->vmpressure);
 
        memcg_deactivate_kmem(memcg);
+
+       wb_memcg_offline(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4588,6 +4687,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       memcg_wb_domain_size_changed(memcg);
 }
 
 #ifdef CONFIG_MMU
@@ -4757,6 +4857,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        unsigned long flags;
        int ret;
+       bool anon;
 
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -4782,15 +4883,33 @@ static int mem_cgroup_move_account(struct page *page,
        if (page->mem_cgroup != from)
                goto out_unlock;
 
+       anon = PageAnon(page);
+
        spin_lock_irqsave(&from->move_lock, flags);
 
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
 
+       /*
+        * move_lock grabbed above and caller set from->moving_account, so
+        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * So mapping should be stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
+
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
@@ -5306,6 +5425,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
 
        memcg->high = high;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
@@ -5338,6 +5458,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
 }
 
index eb59f7eea50827fc09e1c4f7a432b59ff2241d17..22cddd3e5de8433952e99438d3260ae9ff20bd8d 100644 (file)
@@ -122,31 +122,31 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
-unsigned long global_dirty_limit;
+struct wb_domain global_wb_domain;
 
-/*
- * Scale the writeback cache size proportional to the relative writeout speeds.
- *
- * We do this by keeping a floating proportion between BDIs, based on page
- * writeback completions [end_page_writeback()]. Those devices that write out
- * pages fastest will get the larger share, while the slower will get a smaller
- * share.
- *
- * We use page writeout completions because we are interested in getting rid of
- * dirty pages. Having them written out is the primary goal.
- *
- * We introduce a concept of time, a period over which we measure these events,
- * because demand can/will vary over time. The length of this period itself is
- * measured in page writeback completions.
- *
- */
-static struct fprop_global writeout_completions;
+/* consolidated parameters for balance_dirty_pages() and its subroutines */
+struct dirty_throttle_control {
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct wb_domain        *dom;
+       struct dirty_throttle_control *gdtc;    /* only set in memcg dtc's */
+#endif
+       struct bdi_writeback    *wb;
+       struct fprop_local_percpu *wb_completions;
 
-static void writeout_period(unsigned long t);
-/* Timer for aging of writeout_completions */
-static struct timer_list writeout_period_timer =
-               TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
-static unsigned long writeout_period_time = 0;
+       unsigned long           avail;          /* dirtyable */
+       unsigned long           dirty;          /* file_dirty + write + nfs */
+       unsigned long           thresh;         /* dirty threshold */
+       unsigned long           bg_thresh;      /* dirty background threshold */
+
+       unsigned long           wb_dirty;       /* per-wb counterparts */
+       unsigned long           wb_thresh;
+       unsigned long           wb_bg_thresh;
+
+       unsigned long           pos_ratio;
+};
+
+#define DTC_INIT_COMMON(__wb)  .wb = (__wb),                           \
+                               .wb_completions = &(__wb)->completions
 
 /*
  * Length of period for aging writeout fractions of bdis. This is an
@@ -155,6 +155,97 @@ static unsigned long writeout_period_time = 0;
  */
 #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+#define GDTC_INIT(__wb)                .dom = &global_wb_domain,               \
+                               DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB                .dom = &global_wb_domain
+#define MDTC_INIT(__wb, __gdtc)        .dom = mem_cgroup_wb_domain(__wb),      \
+                               .gdtc = __gdtc,                         \
+                               DTC_INIT_COMMON(__wb)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+       return dtc->dom;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+       return dtc->dom;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+       return mdtc->gdtc;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+       return &wb->memcg_completions;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+{
+       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+       unsigned long long min = wb->bdi->min_ratio;
+       unsigned long long max = wb->bdi->max_ratio;
+
+       /*
+        * @wb may already be clean by the time control reaches here and
+        * the total may not include its bw.
+        */
+       if (this_bw < tot_bw) {
+               if (min) {
+                       min *= this_bw;
+                       do_div(min, tot_bw);
+               }
+               if (max < 100) {
+                       max *= this_bw;
+                       do_div(max, tot_bw);
+               }
+       }
+
+       *minp = min;
+       *maxp = max;
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+#define GDTC_INIT(__wb)                DTC_INIT_COMMON(__wb)
+#define GDTC_INIT_NO_WB
+#define MDTC_INIT(__wb, __gdtc)
+
+static bool mdtc_valid(struct dirty_throttle_control *dtc)
+{
+       return false;
+}
+
+static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+{
+       return &global_wb_domain;
+}
+
+static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+{
+       return NULL;
+}
+
+static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+{
+       return NULL;
+}
+
+static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+{
+       *minp = wb->bdi->min_ratio;
+       *maxp = wb->bdi->max_ratio;
+}
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
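
A sketch of how these initializers are meant to be used (inferred from their shape, not quoted from a later patch): the throttling paths build the control structures on the stack, with the memcg dtc pointing back at the global one.  The fragment below assumes wb is whatever bdi_writeback the caller is throttling against.

	struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
	struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
	struct dirty_throttle_control *gdtc = &gdtc_stor;
	struct dirty_throttle_control *mdtc =
		mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL;

With !CONFIG_CGROUP_WRITEBACK, MDTC_INIT() expands to nothing and mdtc_valid() is always false, so the memcg leg falls away at compile time.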
+
 /*
  * In a memory zone, there is a certain amount of pages we consider
  * available for the page cache, which is essentially the number of
@@ -250,42 +341,88 @@ static unsigned long global_dirtyable_memory(void)
        return x + 1;   /* Ensure that we never return 0 */
 }
 
-/*
- * global_dirty_limits - background-writeback and dirty-throttling thresholds
+/**
+ * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+ * @dtc: dirty_throttle_control of interest
  *
- * Calculate the dirty thresholds based on sysctl parameters
- * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
- * - vm.dirty_ratio             or  vm.dirty_bytes
- * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * Calculate @dtc->thresh and ->bg_thresh considering
+ * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
+ * must ensure that @dtc->avail is set before calling this function.  The
+ * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
  * real-time tasks.
  */
-void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+static void domain_dirty_limits(struct dirty_throttle_control *dtc)
 {
-       const unsigned long available_memory = global_dirtyable_memory();
-       unsigned long background;
-       unsigned long dirty;
+       const unsigned long available_memory = dtc->avail;
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+       unsigned long bytes = vm_dirty_bytes;
+       unsigned long bg_bytes = dirty_background_bytes;
+       unsigned long ratio = vm_dirty_ratio;
+       unsigned long bg_ratio = dirty_background_ratio;
+       unsigned long thresh;
+       unsigned long bg_thresh;
        struct task_struct *tsk;
 
-       if (vm_dirty_bytes)
-               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       /* gdtc is !NULL iff @dtc is for memcg domain */
+       if (gdtc) {
+               unsigned long global_avail = gdtc->avail;
+
+               /*
+                * The byte settings can't be applied directly to memcg
+                * domains.  Convert them to ratios by scaling against
+                * globally available memory.
+                */
+               if (bytes)
+                       ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+                                   global_avail, 100UL);
+               if (bg_bytes)
+                       bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+                                      global_avail, 100UL);
+               bytes = bg_bytes = 0;
+       }
+
+       if (bytes)
+               thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
-               dirty = (vm_dirty_ratio * available_memory) / 100;
+               thresh = (ratio * available_memory) / 100;
 
-       if (dirty_background_bytes)
-               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       if (bg_bytes)
+               bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
-               background = (dirty_background_ratio * available_memory) / 100;
+               bg_thresh = (bg_ratio * available_memory) / 100;
 
-       if (background >= dirty)
-               background = dirty / 2;
+       if (bg_thresh >= thresh)
+               bg_thresh = thresh / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-               background += background / 4;
-               dirty += dirty / 4;
+               bg_thresh += bg_thresh / 4;
+               thresh += thresh / 4;
        }
-       *pbackground = background;
-       *pdirty = dirty;
-       trace_global_dirty_state(background, dirty);
+       dtc->thresh = thresh;
+       dtc->bg_thresh = bg_thresh;
+
+       /* we should eventually report the domain in the TP */
+       if (!gdtc)
+               trace_global_dirty_state(bg_thresh, thresh);
+}
+
+/**
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ * @pbackground: out parameter for bg_thresh
+ * @pdirty: out parameter for thresh
+ *
+ * Calculate bg_thresh and thresh for global_wb_domain.  See
+ * domain_dirty_limits() for details.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+{
+       struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+
+       gdtc.avail = global_dirtyable_memory();
+       domain_dirty_limits(&gdtc);
+
+       *pbackground = gdtc.bg_thresh;
+       *pdirty = gdtc.thresh;
 }
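
Worked example with made-up numbers: with 4 GiB of dirtyable memory (avail = 1,048,576 pages of 4 KiB), vm_dirty_ratio = 20, dirty_background_ratio = 10 and no *_bytes overrides, domain_dirty_limits() yields thresh = 209,715 pages (~800 MiB) and bg_thresh = 104,857 pages (~400 MiB).  For a memcg domain the same sysctls apply, except that any *_bytes setting is first converted into a ratio against the globally available memory and only then applied to the (usually smaller) memcg-local avail.  A PF_LESS_THROTTLE task (e.g. nfsd) or a real-time task then sees both thresholds lifted by a further 25%.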
 
 /**
@@ -392,47 +529,52 @@ static unsigned long wp_next_time(unsigned long cur_time)
        return cur_time;
 }
 
-/*
- * Increment the BDI's writeout completion count and the global writeout
- * completion count. Called from test_clear_page_writeback().
- */
-static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+static void wb_domain_writeout_inc(struct wb_domain *dom,
+                                  struct fprop_local_percpu *completions,
+                                  unsigned int max_prop_frac)
 {
-       __inc_bdi_stat(bdi, BDI_WRITTEN);
-       __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
-                              bdi->max_prop_frac);
+       __fprop_inc_percpu_max(&dom->completions, completions,
+                              max_prop_frac);
        /* First event after period switching was turned off? */
-       if (!unlikely(writeout_period_time)) {
+       if (!unlikely(dom->period_time)) {
                /*
                 * We can race with other __bdi_writeout_inc calls here but
                 * it does not cause any harm since the resulting time when
                 * timer will fire and what is in writeout_period_time will be
                 * roughly the same.
                 */
-               writeout_period_time = wp_next_time(jiffies);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               dom->period_time = wp_next_time(jiffies);
+               mod_timer(&dom->period_timer, dom->period_time);
        }
 }
 
-void bdi_writeout_inc(struct backing_dev_info *bdi)
+/*
+ * Increment @wb's writeout completion count and the global writeout
+ * completion count. Called from test_clear_page_writeback().
+ */
+static inline void __wb_writeout_inc(struct bdi_writeback *wb)
 {
-       unsigned long flags;
+       struct wb_domain *cgdom;
 
-       local_irq_save(flags);
-       __bdi_writeout_inc(bdi);
-       local_irq_restore(flags);
+       __inc_wb_stat(wb, WB_WRITTEN);
+       wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+                              wb->bdi->max_prop_frac);
+
+       cgdom = mem_cgroup_wb_domain(wb);
+       if (cgdom)
+               wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+                                      wb->bdi->max_prop_frac);
 }
-EXPORT_SYMBOL_GPL(bdi_writeout_inc);
 
-/*
- * Obtain an accurate fraction of the BDI's portion.
- */
-static void bdi_writeout_fraction(struct backing_dev_info *bdi,
-               long *numerator, long *denominator)
+void wb_writeout_inc(struct bdi_writeback *wb)
 {
-       fprop_fraction_percpu(&writeout_completions, &bdi->completions,
-                               numerator, denominator);
+       unsigned long flags;
+
+       local_irq_save(flags);
+       __wb_writeout_inc(wb);
+       local_irq_restore(flags);
 }
+EXPORT_SYMBOL_GPL(wb_writeout_inc);
 
 /*
  * On idle system, we can be called long after we scheduled because we use
@@ -440,22 +582,46 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
  */
 static void writeout_period(unsigned long t)
 {
-       int miss_periods = (jiffies - writeout_period_time) /
+       struct wb_domain *dom = (void *)t;
+       int miss_periods = (jiffies - dom->period_time) /
                                                 VM_COMPLETIONS_PERIOD_LEN;
 
-       if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-               writeout_period_time = wp_next_time(writeout_period_time +
+       if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+               dom->period_time = wp_next_time(dom->period_time +
                                miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               mod_timer(&dom->period_timer, dom->period_time);
        } else {
                /*
                 * Aging has zeroed all fractions. Stop wasting CPU on period
                 * updates.
                 */
-               writeout_period_time = 0;
+               dom->period_time = 0;
        }
 }
 
+int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+{
+       memset(dom, 0, sizeof(*dom));
+
+       spin_lock_init(&dom->lock);
+
+       init_timer_deferrable(&dom->period_timer);
+       dom->period_timer.function = writeout_period;
+       dom->period_timer.data = (unsigned long)dom;
+
+       dom->dirty_limit_tstamp = jiffies;
+
+       return fprop_global_init(&dom->completions, gfp);
+}
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+void wb_domain_exit(struct wb_domain *dom)
+{
+       del_timer_sync(&dom->period_timer);
+       fprop_global_destroy(&dom->completions);
+}
+#endif
+
 /*
  * bdi_min_ratio keeps the sum of the minimum dirty shares of all
  * registered backing devices, which, for obvious reasons, can not
@@ -510,17 +676,26 @@ static unsigned long dirty_freerun_ceiling(unsigned long thresh,
        return (thresh + bg_thresh) / 2;
 }
 
-static unsigned long hard_dirty_limit(unsigned long thresh)
+static unsigned long hard_dirty_limit(struct wb_domain *dom,
+                                     unsigned long thresh)
 {
-       return max(thresh, global_dirty_limit);
+       return max(thresh, dom->dirty_limit);
+}
+
+/* memory available to a memcg domain is capped by system-wide clean memory */
+static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+{
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+       unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+
+       mdtc->avail = min(mdtc->avail, clean);
 }
 
 /**
- * bdi_dirty_limit - @bdi's share of dirty throttling threshold
- * @bdi: the backing_dev_info to query
- * @dirty: global dirty limit in pages
+ * __wb_calc_thresh - @wb's share of dirty throttling threshold
+ * @dtc: dirty_throttle_control of interest
  *
- * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * Returns @wb's dirty limit in pages. The term "dirty" in the context of
  * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
  *
  * Note that balance_dirty_pages() will only seriously take it as a hard limit
@@ -528,34 +703,47 @@ static unsigned long hard_dirty_limit(unsigned long thresh)
  * control. For example, when the device is completely stalled due to some error
  * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
  * In the other normal situations, it acts more gently by throttling the tasks
- * more (rather than completely block them) when the bdi dirty pages go high.
+ * more (rather than completely block them) when the wb dirty pages go high.
  *
  * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
  *
- * The bdi's share of dirty limit will be adapting to its throughput and
+ * The wb's share of dirty limit will be adapting to its throughput and
  * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
  */
-unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
 {
-       u64 bdi_dirty;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       u64 wb_thresh;
        long numerator, denominator;
+       unsigned long wb_min_ratio, wb_max_ratio;
 
        /*
-        * Calculate this BDI's share of the dirty ratio.
+        * Calculate this BDI's share of the thresh ratio.
         */
-       bdi_writeout_fraction(bdi, &numerator, &denominator);
+       fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+                             &numerator, &denominator);
+
+       wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+       wb_thresh *= numerator;
+       do_div(wb_thresh, denominator);
 
-       bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-       bdi_dirty *= numerator;
-       do_div(bdi_dirty, denominator);
+       wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
 
-       bdi_dirty += (dirty * bdi->min_ratio) / 100;
-       if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-               bdi_dirty = dirty * bdi->max_ratio / 100;
+       wb_thresh += (thresh * wb_min_ratio) / 100;
+       if (wb_thresh > (thresh * wb_max_ratio) / 100)
+               wb_thresh = thresh * wb_max_ratio / 100;
 
-       return bdi_dirty;
+       return wb_thresh;
+}
+
+unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+{
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+                                              .thresh = thresh };
+       return __wb_calc_thresh(&gdtc);
 }
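
Worked example (numbers invented): suppose the domain-wide thresh is 200,000 pages, this device is the only one with a reserved share (min_ratio = 10, so the global bdi_min_ratio sum is also 10), its max_ratio is 40, the wb accounts for all of its bdi's write bandwidth (so wb_min_max_ratio() passes the ratios through unscaled), and the fprop bookkeeping attributes 25% of recent writeout completions to this wb.  The proportional part is 200,000 * 90% * 1/4 = 45,000 pages, the reserved minimum adds 200,000 * 10% = 20,000, and the 65,000-page total stays under the 80,000-page max_ratio cap, so wb_calc_thresh() returns 65,000 pages.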
 
 /*
@@ -594,7 +782,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *
  * (o) global/bdi setpoints
  *
- * We want the dirty pages be balanced around the global/bdi setpoints.
+ * We want the dirty pages to be balanced around the global/wb setpoints.
  * When the number of dirty pages is higher/lower than the setpoint, the
  * dirty position control ratio (and hence task dirty ratelimit) will be
  * decreased/increased to bring the dirty pages back to the setpoint.
@@ -604,8 +792,8 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *     if (dirty < setpoint) scale up   pos_ratio
  *     if (dirty > setpoint) scale down pos_ratio
  *
- *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
- *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+ *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
+ *     if (wb_dirty > wb_setpoint) scale down pos_ratio
  *
  *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
  *
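
To make the global control line concrete: pos_ratio_polynom() (named in the hunk headers above) computes, in fixed point, approximately

	pos_ratio = 1 + ((setpoint - dirty) / (limit - setpoint))^3

so pos_ratio is exactly 1.0 at the setpoint, climbs toward 2.0 as dirty drops to the freerun ceiling (the setpoint sits midway between freerun and limit), and falls to 0 as dirty reaches the hard limit; task_ratelimit scales with it as shown above.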
@@ -630,7 +818,7 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *   0 +------------.------------------.----------------------*------------->
  *           freerun^          setpoint^                 limit^   dirty pages
  *
- * (o) bdi control line
+ * (o) wb control line
  *
  *     ^ pos_ratio
  *     |
@@ -656,33 +844,32 @@ static long long pos_ratio_polynom(unsigned long setpoint,
  *     |                      .                           .
  *     |                      .                             .
  *   0 +----------------------.-------------------------------.------------->
- *                bdi_setpoint^                    x_intercept^
+ *                wb_setpoint^                    x_intercept^
  *
- * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+ * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
  * be smoothly throttled down to normal if it starts high in situations like
  * - start writing to a slow SD card and a fast disk at the same time. The SD
- *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
- * - the bdi dirty thresh drops quickly due to change of JBOD workload
+ *   card's wb_dirty may rush to many times higher than wb_setpoint.
+ * - the wb dirty thresh drops quickly due to change of JBOD workload
  */
-static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
-                                       unsigned long thresh,
-                                       unsigned long bg_thresh,
-                                       unsigned long dirty,
-                                       unsigned long bdi_thresh,
-                                       unsigned long bdi_dirty)
-{
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+static void wb_position_ratio(struct dirty_throttle_control *dtc)
+{
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+       unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;         /* dirty pages' target balance point */
-       unsigned long bdi_setpoint;
+       unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;            /* for scaling up/down the rate limit */
        long x;
 
-       if (unlikely(dirty >= limit))
-               return 0;
+       dtc->pos_ratio = 0;
+
+       if (unlikely(dtc->dirty >= limit))
+               return;
 
        /*
         * global setpoint
@@ -690,165 +877,167 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
-       pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+       pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
 
        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
-        * such filesystems balance_dirty_pages always checks bdi counters
-        * against bdi limits. Even if global "nr_dirty" is under "freerun".
+        * such filesystems balance_dirty_pages always checks wb counters
+        * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without strictlimit feature, fuse writeback may
         * consume arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
-        * Here, in bdi_position_ratio(), we calculate pos_ratio based on
-        * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+        * Here, in wb_position_ratio(), we calculate pos_ratio based on
+        * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
-        * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-        * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
-        * about ~6K pages (as the average of background and throttle bdi
+        * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+        * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+        * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
-        * bdi_dirty is under bdi_setpoint and vice versa.
+        * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
-        * because we want to throttle process writing to a strictlimit BDI
+        * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               long long bdi_pos_ratio;
-               unsigned long bdi_bg_thresh;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               long long wb_pos_ratio;
 
-               if (bdi_dirty < 8)
-                       return min_t(long long, pos_ratio * 2,
-                                    2 << RATELIMIT_CALC_SHIFT);
+               if (dtc->wb_dirty < 8) {
+                       dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+                                          2 << RATELIMIT_CALC_SHIFT);
+                       return;
+               }
 
-               if (bdi_dirty >= bdi_thresh)
-                       return 0;
+               if (dtc->wb_dirty >= wb_thresh)
+                       return;
 
-               bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
-               bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
-                                                    bdi_bg_thresh);
+               wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+                                                   dtc->wb_bg_thresh);
 
-               if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
-                       return 0;
+               if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+                       return;
 
-               bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
-                                                 bdi_thresh);
+               wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+                                                wb_thresh);
 
                /*
-                * Typically, for strictlimit case, bdi_setpoint << setpoint
-                * and pos_ratio >> bdi_pos_ratio. In the other words global
+                * Typically, for strictlimit case, wb_setpoint << setpoint
+                * and pos_ratio >> wb_pos_ratio. In other words, global
                 * state ("dirty") is not the limiting factor and we have to
-                * make decision based on bdi counters. But there is an
+                * make decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
-                * BDIs) while given strictlimit BDI is below limit.
+                * wb's) while given strictlimit wb is below limit.
                 *
-                * "pos_ratio * bdi_pos_ratio" would work for the case above,
+                * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
-                * activity in the system coming from a single strictlimit BDI
+                * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
-                * (when globally we are at freerun and bdi is well below bdi
+                * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
-               return min(pos_ratio, bdi_pos_ratio);
+               dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+               return;
        }
 
        /*
         * We have computed basic pos_ratio above based on global situation. If
-        * the bdi is over/under its share of dirty pages, we want to scale
+        * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */
 
        /*
-        * bdi setpoint
+        * wb setpoint
         *
-        *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+        *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
-        *                        x_intercept - bdi_dirty
+        *                        x_intercept - wb_dirty
         *                     := --------------------------
-        *                        x_intercept - bdi_setpoint
+        *                        x_intercept - wb_setpoint
         *
-        * The main bdi control line is a linear function that subjects to
+        * The main wb control line is a linear function that is subject to
         *
-        * (1) f(bdi_setpoint) = 1.0
-        * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
-        *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
+        * (1) f(wb_setpoint) = 1.0
+        * (2) k = - 1 / (8 * write_bw)  (in single wb case)
+        *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
-        * For single bdi case, the dirty pages are observed to fluctuate
+        * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
-        *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+        *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
-        * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+        * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
-        * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+        * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
-       if (unlikely(bdi_thresh > thresh))
-               bdi_thresh = thresh;
+       if (unlikely(wb_thresh > dtc->thresh))
+               wb_thresh = dtc->thresh;
        /*
-        * It's very possible that bdi_thresh is close to 0 not because the
+        * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but because it has remained inactive for a long time.
         * Honour such devices with a reasonably good (hopefully IO efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can ramp up the threshold quickly.
         */
-       bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+       wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
-        * scale global setpoint to bdi's:
-        *      bdi_setpoint = setpoint * bdi_thresh / thresh
+        * scale global setpoint to wb's:
+        *      wb_setpoint = setpoint * wb_thresh / thresh
         */
-       x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-       bdi_setpoint = setpoint * (u64)x >> 16;
+       x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+       wb_setpoint = setpoint * (u64)x >> 16;
        /*
-        * Use span=(8*write_bw) in single bdi case as indicated by
-        * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+        * Use span=(8*write_bw) in single wb case as indicated by
+        * (thresh - wb_thresh ~= 0) and transition to wb_thresh in the JBOD case.
         *
-        *        bdi_thresh                    thresh - bdi_thresh
-        * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-        *          thresh                            thresh
+        *        wb_thresh                    thresh - wb_thresh
+        * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+        *         thresh                           thresh
         */
-       span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-       x_intercept = bdi_setpoint + span;
+       span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+       x_intercept = wb_setpoint + span;
 
-       if (bdi_dirty < x_intercept - span / 4) {
-               pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-                                     (x_intercept - bdi_setpoint) | 1);
+       if (dtc->wb_dirty < x_intercept - span / 4) {
+               pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
+                                     (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;
 
        /*
-        * bdi reserve area, safeguard against dirty pool underrun and disk idle
+        * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         */
-       x_intercept = bdi_thresh / 2;
-       if (bdi_dirty < x_intercept) {
-               if (bdi_dirty > x_intercept / 8)
-                       pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+       x_intercept = wb_thresh / 2;
+       if (dtc->wb_dirty < x_intercept) {
+               if (dtc->wb_dirty > x_intercept / 8)
+                       pos_ratio = div_u64(pos_ratio * x_intercept,
+                                           dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }
 
-       return pos_ratio;
+       dtc->pos_ratio = pos_ratio;
 }
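
The wb control line above amounts to a single linear rescaling of the global pos_ratio. Below is a minimal userspace sketch of that arithmetic, assuming the single-wb span of 8 * write_bw and using made-up numbers; the helper name is illustrative and this is not the kernel code itself.

    #include <stdio.h>

    #define RATELIMIT_CALC_SHIFT 10        /* same fixed-point scale as pos_ratio */

    /*
     * Rescale pos_ratio along the wb control line:
     *   factor = (x_intercept - wb_dirty) / (x_intercept - wb_setpoint)
     * with x_intercept = wb_setpoint + span and span = 8 * write_bw,
     * i.e. the single-wb case described in the comment above.
     */
    static unsigned long long wb_scale_pos_ratio(unsigned long long pos_ratio,
                                                 unsigned long wb_setpoint,
                                                 unsigned long write_bw,
                                                 unsigned long wb_dirty)
    {
            unsigned long span = 8 * write_bw;
            unsigned long x_intercept = wb_setpoint + span;

            if (wb_dirty < x_intercept - span / 4)
                    return pos_ratio * (x_intercept - wb_dirty) /
                           (x_intercept - wb_setpoint);
            return pos_ratio / 4;          /* far beyond the setpoint: throttle hard */
    }

    int main(void)
    {
            unsigned long long one = 1 << RATELIMIT_CALC_SHIFT;    /* 1.0 */

            printf("%llu\n", wb_scale_pos_ratio(one, 1000, 100, 1000)); /* 1024, at the setpoint */
            printf("%llu\n", wb_scale_pos_ratio(one, 1000, 100, 1050)); /* 960, i.e. 15/16 */
            return 0;
    }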
 
-static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
-                                      unsigned long elapsed,
-                                      unsigned long written)
+static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+                                     unsigned long elapsed,
+                                     unsigned long written)
 {
        const unsigned long period = roundup_pow_of_two(3 * HZ);
-       unsigned long avg = bdi->avg_write_bandwidth;
-       unsigned long old = bdi->write_bandwidth;
+       unsigned long avg = wb->avg_write_bandwidth;
+       unsigned long old = wb->write_bandwidth;
        u64 bw;
 
        /*
@@ -861,14 +1050,14 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
         * @written may have decreased due to account_page_redirty().
         * Avoid underflowing @bw calculation.
         */
-       bw = written - min(written, bdi->written_stamp);
+       bw = written - min(written, wb->written_stamp);
        bw *= HZ;
        if (unlikely(elapsed > period)) {
                do_div(bw, elapsed);
                avg = bw;
                goto out;
        }
-       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw += (u64)wb->write_bandwidth * (period - elapsed);
        bw >>= ilog2(period);
 
        /*
@@ -881,21 +1070,22 @@ static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
                avg += (old - avg) >> 3;
 
 out:
-       bdi->write_bandwidth = bw;
-       bdi->avg_write_bandwidth = avg;
+       /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+       avg = max(avg, 1LU);
+       if (wb_has_dirty_io(wb)) {
+               long delta = avg - wb->avg_write_bandwidth;
+               WARN_ON_ONCE(atomic_long_add_return(delta,
+                                       &wb->bdi->tot_write_bandwidth) <= 0);
+       }
+       wb->write_bandwidth = bw;
+       wb->avg_write_bandwidth = avg;
 }
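
For a feel of the blending in wb_update_write_bandwidth(), the same weighting can be reproduced in a few lines of plain C. This is only a sketch: HZ is assumed to be 1000, a divide stands in for the ilog2() shift, and the helper name and numbers are invented.

    #include <stdio.h>

    #define HZ     1000
    #define PERIOD 4096    /* roundup_pow_of_two(3 * HZ) when HZ == 1000 */

    /*
     * Blend the bandwidth seen over the last @elapsed jiffies with the
     * previous estimate, each weighted by its share of the ~3s period.
     */
    static unsigned long estimate_write_bw(unsigned long old_bw,
                                           unsigned long written_pages,
                                           unsigned long elapsed)
    {
            unsigned long long bw = (unsigned long long)written_pages * HZ;

            if (elapsed > PERIOD)
                    return bw / elapsed;            /* the sample covers the whole period */

            bw += (unsigned long long)old_bw * (PERIOD - elapsed);
            return bw / PERIOD;                     /* the "bw >>= ilog2(period)" step */
    }

    int main(void)
    {
            /* 5000 pages written in 200ms against an old estimate of 20000 pages/s */
            printf("%lu\n", estimate_write_bw(20000, 5000, HZ / 5));    /* ~20244 */
            return 0;
    }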
 
-/*
- * The global dirtyable memory and dirty threshold could be suddenly knocked
- * down by a large amount (eg. on the startup of KVM in a swapless system).
- * This may throw the system into deep dirty exceeded state and throttle
- * heavy/light dirtiers alike. To retain good responsiveness, maintain
- * global_dirty_limit for tracking slowly down to the knocked down dirty
- * threshold.
- */
-static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+static void update_dirty_limit(struct dirty_throttle_control *dtc)
 {
-       unsigned long limit = global_dirty_limit;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       unsigned long limit = dom->dirty_limit;
 
        /*
         * Follow up in one step.
@@ -908,63 +1098,57 @@ static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
        /*
         * Follow down slowly. Use the higher one as the target, because thresh
         * may drop below dirty. This is exactly the reason to introduce
-        * global_dirty_limit which is guaranteed to lie above the dirty pages.
+        * dom->dirty_limit which is guaranteed to lie above the dirty pages.
         */
-       thresh = max(thresh, dirty);
+       thresh = max(thresh, dtc->dirty);
        if (limit > thresh) {
                limit -= (limit - thresh) >> 5;
                goto update;
        }
        return;
 update:
-       global_dirty_limit = limit;
+       dom->dirty_limit = limit;
 }
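
The asymmetry described above (follow a rising threshold in one step, decay toward a lower one by 1/32 of the gap per update) is easy to see in isolation. A small standalone sketch under those assumptions; the helper name is invented and the loop only illustrates the decay.

    #include <stdio.h>

    /* Track a limit that follows thresh up instantly but down only slowly. */
    static unsigned long track_dirty_limit(unsigned long limit,
                                           unsigned long thresh,
                                           unsigned long dirty)
    {
            if (limit < thresh)
                    return thresh;                  /* follow up in one step */

            if (thresh < dirty)
                    thresh = dirty;                 /* never chase below the dirty pages */
            if (limit > thresh)
                    limit -= (limit - thresh) >> 5; /* follow down slowly */
            return limit;
    }

    int main(void)
    {
            unsigned long limit = 100000;
            int i;

            /* thresh suddenly knocked down to 60000: limit creeps toward it */
            for (i = 0; i < 3; i++) {
                    limit = track_dirty_limit(limit, 60000, 50000);
                    printf("%lu\n", limit);         /* 98750, 97540, 96367 */
            }
            return 0;
    }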
 
-static void global_update_bandwidth(unsigned long thresh,
-                                   unsigned long dirty,
+static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
                                    unsigned long now)
 {
-       static DEFINE_SPINLOCK(dirty_lock);
-       static unsigned long update_time = INITIAL_JIFFIES;
+       struct wb_domain *dom = dtc_dom(dtc);
 
        /*
         * check locklessly first to optimize away locking most of the time
         */
-       if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+       if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                return;
 
-       spin_lock(&dirty_lock);
-       if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
-               update_dirty_limit(thresh, dirty);
-               update_time = now;
+       spin_lock(&dom->lock);
+       if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+               update_dirty_limit(dtc);
+               dom->dirty_limit_tstamp = now;
        }
-       spin_unlock(&dirty_lock);
+       spin_unlock(&dom->lock);
 }
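
The locking idiom in domain_update_bandwidth(), an unlocked timestamp check to skip the common case followed by a recheck under the lock, is a generic pattern. A compact pthread sketch of the same shape, with invented names; an atomic stands in for the word-sized jiffies read, and the worst case is only a skipped or redundant refresh.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <time.h>

    struct rate_limited_state {
            pthread_mutex_t lock;
            atomic_long last_update;        /* seconds since the epoch */
    };

    static bool maybe_refresh(struct rate_limited_state *st, long now, long interval)
    {
            bool refreshed = false;

            /* cheap unlocked check first; clearly too early means no lock at all */
            if (now < atomic_load(&st->last_update) + interval)
                    return false;

            pthread_mutex_lock(&st->lock);
            /* recheck under the lock before touching the shared state */
            if (now >= atomic_load(&st->last_update) + interval) {
                    atomic_store(&st->last_update, now);
                    refreshed = true;
            }
            pthread_mutex_unlock(&st->lock);
            return refreshed;
    }

    int main(void)
    {
            struct rate_limited_state st = { PTHREAD_MUTEX_INITIALIZER, 0 };

            return maybe_refresh(&st, (long)time(NULL), 1) ? 0 : 1;
    }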
 
 /*
- * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+ * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
  *
- * Normal bdi tasks will be curbed at or below it in long term.
+ * Normal wb tasks will be curbed at or below it in the long term.
  * Obviously it should be around (write_bw / N) when there are N dd tasks.
  */
-static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
-                                      unsigned long thresh,
-                                      unsigned long bg_thresh,
-                                      unsigned long dirty,
-                                      unsigned long bdi_thresh,
-                                      unsigned long bdi_dirty,
-                                      unsigned long dirtied,
-                                      unsigned long elapsed)
-{
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+                                     unsigned long dirtied,
+                                     unsigned long elapsed)
+{
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long dirty = dtc->dirty;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
-       unsigned long pos_ratio;
        unsigned long step;
        unsigned long x;
 
@@ -972,20 +1156,18 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
         * The dirty rate will match the writeout rate in long term, except
         * when dirty pages are truncated by userspace or re-dirtied by FS.
         */
-       dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+       dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
 
-       pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
-                                      bdi_thresh, bdi_dirty);
        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
-                                       pos_ratio >> RATELIMIT_CALC_SHIFT;
+                                       dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
 
        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
-        * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+        * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
@@ -1024,7 +1206,7 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
        /*
         * We could safely do this and return immediately:
         *
-        *      bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+        *      wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However, to get a more stable dirty_ratelimit, the more elaborate
         * code below makes use of task_ratelimit to filter out singular points and
@@ -1058,32 +1240,31 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
        step = 0;
 
        /*
-        * For strictlimit case, calculations above were based on bdi counters
-        * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+        * For strictlimit case, calculations above were based on wb counters
+        * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
-        * Hence, to calculate "step" properly, we have to use bdi_dirty as
-        * "dirty" and bdi_setpoint as "setpoint".
+        * Hence, to calculate "step" properly, we have to use wb_dirty as
+        * "dirty" and wb_setpoint as "setpoint".
         *
-        * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
-        * it's possible that bdi_thresh is close to zero due to inactivity
-        * of backing device (see the implementation of bdi_dirty_limit()).
+        * We ramp up dirty_ratelimit forcibly if wb_dirty is low because
+        * it's possible that wb_thresh is close to zero due to inactivity
+        * of backing device.
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               dirty = bdi_dirty;
-               if (bdi_dirty < 8)
-                       setpoint = bdi_dirty + 1;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               dirty = dtc->wb_dirty;
+               if (dtc->wb_dirty < 8)
+                       setpoint = dtc->wb_dirty + 1;
                else
-                       setpoint = (bdi_thresh +
-                                   bdi_dirty_limit(bdi, bg_thresh)) / 2;
+                       setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }
 
        if (dirty < setpoint) {
-               x = min3(bdi->balanced_dirty_ratelimit,
+               x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
-               x = max3(bdi->balanced_dirty_ratelimit,
+               x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
@@ -1105,69 +1286,67 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
        else
                dirty_ratelimit -= step;
 
-       bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-       bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+       wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+       wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
 
-       trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
 }
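
The "balanced" rate reasoning a few comments up (N tasks at task_ratelimit produce a dirty_rate of N * task_ratelimit, so task_ratelimit * write_bw / dirty_rate collapses to write_bw / N) is worth seeing with numbers. A toy example with invented figures, four writers against a 20000 pages/s device; this is not kernel code.

    #include <stdio.h>

    /* balanced rate = task_ratelimit * write_bw / dirty_rate */
    static unsigned long balanced_ratelimit(unsigned long task_ratelimit,
                                            unsigned long write_bw,
                                            unsigned long dirty_rate)
    {
            return (unsigned long)((unsigned long long)task_ratelimit * write_bw /
                                   dirty_rate);
    }

    int main(void)
    {
            unsigned long task_ratelimit = 5000;            /* pages/s per task */
            unsigned long dirty_rate = 4 * task_ratelimit;  /* 4 tasks observed */

            /* prints 5000, i.e. write_bw / N */
            printf("%lu\n", balanced_ratelimit(task_ratelimit, 20000, dirty_rate));
            return 0;
    }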
 
-void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-                           unsigned long thresh,
-                           unsigned long bg_thresh,
-                           unsigned long dirty,
-                           unsigned long bdi_thresh,
-                           unsigned long bdi_dirty,
-                           unsigned long start_time)
+static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+                                 struct dirty_throttle_control *mdtc,
+                                 unsigned long start_time,
+                                 bool update_ratelimit)
 {
+       struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
-       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long elapsed = now - wb->bw_time_stamp;
        unsigned long dirtied;
        unsigned long written;
 
+       lockdep_assert_held(&wb->list_lock);
+
        /*
         * rate-limit, only update once every 200ms.
         */
        if (elapsed < BANDWIDTH_INTERVAL)
                return;
 
-       dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
-       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+       dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+       written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
 
        /*
         * Skip quiet periods when disk bandwidth is under-utilized.
         * (at least 1s idle time between two flusher runs)
         */
-       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+       if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
                goto snapshot;
 
-       if (thresh) {
-               global_update_bandwidth(thresh, dirty, now);
-               bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
-                                          bdi_thresh, bdi_dirty,
-                                          dirtied, elapsed);
+       if (update_ratelimit) {
+               domain_update_bandwidth(gdtc, now);
+               wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+
+               /*
+                * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+                * compiler has no way to figure that out.  Help it.
+                */
+               if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+                       domain_update_bandwidth(mdtc, now);
+                       wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+               }
        }
-       bdi_update_write_bandwidth(bdi, elapsed, written);
+       wb_update_write_bandwidth(wb, elapsed, written);
 
 snapshot:
-       bdi->dirtied_stamp = dirtied;
-       bdi->written_stamp = written;
-       bdi->bw_time_stamp = now;
+       wb->dirtied_stamp = dirtied;
+       wb->written_stamp = written;
+       wb->bw_time_stamp = now;
 }
 
-static void bdi_update_bandwidth(struct backing_dev_info *bdi,
-                                unsigned long thresh,
-                                unsigned long bg_thresh,
-                                unsigned long dirty,
-                                unsigned long bdi_thresh,
-                                unsigned long bdi_dirty,
-                                unsigned long start_time)
+void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
 {
-       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
-               return;
-       spin_lock(&bdi->wb.list_lock);
-       __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
-                              bdi_thresh, bdi_dirty, start_time);
-       spin_unlock(&bdi->wb.list_lock);
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+
+       __wb_update_bandwidth(&gdtc, NULL, start_time, false);
 }
 
 /*
@@ -1187,10 +1366,10 @@ static unsigned long dirty_poll_interval(unsigned long dirty,
        return 1;
 }
 
-static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-                                  unsigned long bdi_dirty)
+static unsigned long wb_max_pause(struct bdi_writeback *wb,
+                                 unsigned long wb_dirty)
 {
-       unsigned long bw = bdi->avg_write_bandwidth;
+       unsigned long bw = wb->avg_write_bandwidth;
        unsigned long t;
 
        /*
@@ -1200,20 +1379,20 @@ static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
         *
         * 8 serves as the safety ratio.
         */
-       t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+       t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
        t++;
 
        return min_t(unsigned long, t, MAX_PAUSE);
 }
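
wb_max_pause() is pure arithmetic, so a standalone sketch shows what the cap works out to; HZ is assumed to be 1000 and the 200ms value stands in for the MAX_PAUSE used here, so treat the numbers as illustrative.

    #include <stdio.h>

    #define HZ        1000
    #define MAX_PAUSE (HZ / 5)      /* assumed 200ms ceiling */

    /*
     * Cap one pause so the pages dirtied against this wb can be written
     * back in roughly wb_dirty / write_bw time; the divisor of ~HZ/8
     * (rounded up to 128 here) is the safety ratio from the comment above.
     */
    static unsigned long max_pause(unsigned long write_bw, unsigned long wb_dirty)
    {
            unsigned long t = wb_dirty / (1 + write_bw / 128) + 1;

            return t < MAX_PAUSE ? t : MAX_PAUSE;
    }

    int main(void)
    {
            /* slow USB stick, few dirty pages: short pause */
            printf("%lu\n", max_pause(500, 100));           /* 26 jiffies */
            /* fast disk, huge backlog: clamp at the ceiling */
            printf("%lu\n", max_pause(100000, 1000000));    /* 200 jiffies */
            return 0;
    }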
 
-static long bdi_min_pause(struct backing_dev_info *bdi,
-                         long max_pause,
-                         unsigned long task_ratelimit,
-                         unsigned long dirty_ratelimit,
-                         int *nr_dirtied_pause)
+static long wb_min_pause(struct bdi_writeback *wb,
+                        long max_pause,
+                        unsigned long task_ratelimit,
+                        unsigned long dirty_ratelimit,
+                        int *nr_dirtied_pause)
 {
-       long hi = ilog2(bdi->avg_write_bandwidth);
-       long lo = ilog2(bdi->dirty_ratelimit);
+       long hi = ilog2(wb->avg_write_bandwidth);
+       long lo = ilog2(wb->dirty_ratelimit);
        long t;         /* target pause */
        long pause;     /* estimated next pause */
        int pages;      /* target nr_dirtied_pause */
@@ -1281,34 +1460,27 @@ static long bdi_min_pause(struct backing_dev_info *bdi,
        return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
 }
 
-static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
-                                   unsigned long dirty_thresh,
-                                   unsigned long background_thresh,
-                                   unsigned long *bdi_dirty,
-                                   unsigned long *bdi_thresh,
-                                   unsigned long *bdi_bg_thresh)
+static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
 {
-       unsigned long bdi_reclaimable;
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long wb_reclaimable;
 
        /*
-        * bdi_thresh is not treated as some limiting factor as
+        * wb_thresh is not treated as some limiting factor as
         * dirty_thresh, due to reasons
-        * - in JBOD setup, bdi_thresh can fluctuate a lot
+        * - in JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
-        *   go into state (bdi_dirty >> bdi_thresh) either because
-        *   bdi_dirty starts high, or because bdi_thresh drops low.
+        *   go into state (wb_dirty >> wb_thresh) either because
+        *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
-        *   dirtiers for 100 seconds until bdi_dirty drops under
-        *   bdi_thresh. Instead the auxiliary bdi control line in
-        *   bdi_position_ratio() will let the dirtier task progress
-        *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+        *   dirtiers for 100 seconds until wb_dirty drops under
+        *   wb_thresh. Instead the auxiliary wb control line in
+        *   wb_position_ratio() will let the dirtier task progress
+        *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
-       *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-
-       if (bdi_bg_thresh)
-               *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
-                                                       background_thresh,
-                                                       dirty_thresh) : 0;
+       dtc->wb_thresh = __wb_calc_thresh(dtc);
+       dtc->wb_bg_thresh = dtc->thresh ?
+               div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
 
        /*
         * In order to avoid the stacked BDI deadlock we need
@@ -1320,14 +1492,12 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
-       if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
-               bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat_sum(bdi, BDI_WRITEBACK);
+       if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+               wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
        } else {
-               bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat(bdi, BDI_WRITEBACK);
+               wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
        }
 }
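
Since wb_bg_thresh above is just the global bg_thresh/thresh ratio applied to this wb's slice of the dirty limit, the proportionality fits in a two-line helper. Names and numbers below are made up for illustration.

    #include <stdio.h>
    #include <stdint.h>

    /* per-wb background threshold = wb_thresh scaled by bg_thresh/thresh */
    static unsigned long wb_bg_share(unsigned long wb_thresh,
                                     unsigned long bg_thresh,
                                     unsigned long thresh)
    {
            if (!thresh)
                    return 0;
            return (unsigned long)((uint64_t)wb_thresh * bg_thresh / thresh);
    }

    int main(void)
    {
            /* global thresh 100000, bg_thresh 50000, this wb's thresh 30000 */
            printf("%lu\n", wb_bg_share(30000, 50000, 100000));     /* 15000 */
            return 0;
    }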
 
@@ -1339,12 +1509,16 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
  * perform some writeout.
  */
 static void balance_dirty_pages(struct address_space *mapping,
+                               struct bdi_writeback *wb,
                                unsigned long pages_dirtied)
 {
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+       struct dirty_throttle_control *sdtc;
        unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
-       unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
-       unsigned long background_thresh;
-       unsigned long dirty_thresh;
        long period;
        long pause;
        long max_pause;
@@ -1353,18 +1527,14 @@ static void balance_dirty_pages(struct address_space *mapping,
        bool dirty_exceeded = false;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
-       unsigned long pos_ratio;
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
 
        for (;;) {
                unsigned long now = jiffies;
-               unsigned long uninitialized_var(bdi_thresh);
-               unsigned long thresh;
-               unsigned long uninitialized_var(bdi_dirty);
-               unsigned long dirty;
-               unsigned long bg_thresh;
+               unsigned long dirty, thresh, bg_thresh;
+               unsigned long m_dirty, m_thresh, m_bg_thresh;
 
                /*
                 * Unstable writes are a feature of certain networked
@@ -1374,65 +1544,127 @@ static void balance_dirty_pages(struct address_space *mapping,
                 */
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
-               nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+               gdtc->avail = global_dirtyable_memory();
+               gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
 
-               global_dirty_limits(&background_thresh, &dirty_thresh);
+               domain_dirty_limits(gdtc);
 
                if (unlikely(strictlimit)) {
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, &bg_thresh);
+                       wb_dirty_limits(gdtc);
 
-                       dirty = bdi_dirty;
-                       thresh = bdi_thresh;
+                       dirty = gdtc->wb_dirty;
+                       thresh = gdtc->wb_thresh;
+                       bg_thresh = gdtc->wb_bg_thresh;
                } else {
-                       dirty = nr_dirty;
-                       thresh = dirty_thresh;
-                       bg_thresh = background_thresh;
+                       dirty = gdtc->dirty;
+                       thresh = gdtc->thresh;
+                       bg_thresh = gdtc->bg_thresh;
+               }
+
+               if (mdtc) {
+                       unsigned long writeback;
+
+                       /*
+                        * If @wb belongs to !root memcg, repeat the same
+                        * basic calculations for the memcg domain.
+                        */
+                       mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+                                           &writeback);
+                       mdtc_cap_avail(mdtc);
+                       mdtc->dirty += writeback;
+
+                       domain_dirty_limits(mdtc);
+
+                       if (unlikely(strictlimit)) {
+                               wb_dirty_limits(mdtc);
+                               m_dirty = mdtc->wb_dirty;
+                               m_thresh = mdtc->wb_thresh;
+                               m_bg_thresh = mdtc->wb_bg_thresh;
+                       } else {
+                               m_dirty = mdtc->dirty;
+                               m_thresh = mdtc->thresh;
+                               m_bg_thresh = mdtc->bg_thresh;
+                       }
                }
 
                /*
                 * Throttle it only when the background writeback cannot
                 * catch up. This avoids (excessively) small writeouts
-                * when the bdi limits are ramping up in case of !strictlimit.
+                * when the wb limits are ramping up in case of !strictlimit.
                 *
-                * In strictlimit case make decision based on the bdi counters
-                * and limits. Small writeouts when the bdi limits are ramping
+                * In strictlimit case make decision based on the wb counters
+                * and limits. Small writeouts when the wb limits are ramping
                 * up are the price we consciously pay for strictlimit-ing.
+                *
+                * If memcg domain is in effect, @dirty should be under
+                * both global and memcg freerun ceilings.
                 */
-               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+                   (!mdtc ||
+                    m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+                       unsigned long intv = dirty_poll_interval(dirty, thresh);
+                       unsigned long m_intv = ULONG_MAX;
+
                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
-                       current->nr_dirtied_pause =
-                               dirty_poll_interval(dirty, thresh);
+                       if (mdtc)
+                               m_intv = dirty_poll_interval(m_dirty, m_thresh);
+                       current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }
 
-               if (unlikely(!writeback_in_progress(bdi)))
-                       bdi_start_background_writeback(bdi);
+               if (unlikely(!writeback_in_progress(wb)))
+                       wb_start_background_writeback(wb);
 
+               /*
+                * Calculate global domain's pos_ratio and select the
+                * global dtc by default.
+                */
                if (!strictlimit)
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, NULL);
-
-               dirty_exceeded = (bdi_dirty > bdi_thresh) &&
-                                ((nr_dirty > dirty_thresh) || strictlimit);
-               if (dirty_exceeded && !bdi->dirty_exceeded)
-                       bdi->dirty_exceeded = 1;
-
-               bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
-                                    nr_dirty, bdi_thresh, bdi_dirty,
-                                    start_time);
-
-               dirty_ratelimit = bdi->dirty_ratelimit;
-               pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
-                                              background_thresh, nr_dirty,
-                                              bdi_thresh, bdi_dirty);
-               task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+                       wb_dirty_limits(gdtc);
+
+               dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+                       ((gdtc->dirty > gdtc->thresh) || strictlimit);
+
+               wb_position_ratio(gdtc);
+               sdtc = gdtc;
+
+               if (mdtc) {
+                       /*
+                        * If memcg domain is in effect, calculate its
+                        * pos_ratio.  @wb should satisfy constraints from
+                        * both global and memcg domains.  Choose the one
+                        * w/ lower pos_ratio.
+                        */
+                       if (!strictlimit)
+                               wb_dirty_limits(mdtc);
+
+                       dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+                               ((mdtc->dirty > mdtc->thresh) || strictlimit);
+
+                       wb_position_ratio(mdtc);
+                       if (mdtc->pos_ratio < gdtc->pos_ratio)
+                               sdtc = mdtc;
+               }
+
+               if (dirty_exceeded && !wb->dirty_exceeded)
+                       wb->dirty_exceeded = 1;
+
+               if (time_is_before_jiffies(wb->bw_time_stamp +
+                                          BANDWIDTH_INTERVAL)) {
+                       spin_lock(&wb->list_lock);
+                       __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+                       spin_unlock(&wb->list_lock);
+               }
+
+               /* throttle according to the chosen dtc */
+               dirty_ratelimit = wb->dirty_ratelimit;
+               task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
-               max_pause = bdi_max_pause(bdi, bdi_dirty);
-               min_pause = bdi_min_pause(bdi, max_pause,
-                                         task_ratelimit, dirty_ratelimit,
-                                         &nr_dirtied_pause);
+               max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+               min_pause = wb_min_pause(wb, max_pause,
+                                        task_ratelimit, dirty_ratelimit,
+                                        &nr_dirtied_pause);
 
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
@@ -1452,11 +1684,11 @@ static void balance_dirty_pages(struct address_space *mapping,
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(bdi,
-                                                 dirty_thresh,
-                                                 background_thresh,
-                                                 nr_dirty,
-                                                 bdi_thresh,
-                                                 bdi_dirty,
+                                                 sdtc->thresh,
+                                                 sdtc->bg_thresh,
+                                                 sdtc->dirty,
+                                                 sdtc->wb_thresh,
+                                                 sdtc->wb_dirty,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
@@ -1481,11 +1713,11 @@ static void balance_dirty_pages(struct address_space *mapping,
 
 pause:
                trace_balance_dirty_pages(bdi,
-                                         dirty_thresh,
-                                         background_thresh,
-                                         nr_dirty,
-                                         bdi_thresh,
-                                         bdi_dirty,
+                                         sdtc->thresh,
+                                         sdtc->bg_thresh,
+                                         sdtc->dirty,
+                                         sdtc->wb_thresh,
+                                         sdtc->wb_dirty,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
@@ -1500,33 +1732,33 @@ pause:
                current->nr_dirtied_pause = nr_dirtied_pause;
 
                /*
-                * This is typically equal to (nr_dirty < dirty_thresh) and can
-                * also keep "1000+ dd on a slow USB stick" under control.
+                * This is typically equal to (dirty < thresh) and can also
+                * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;
 
                /*
                 * In the case of an unresponsive NFS server where the NFS dirty
-                * pages exceeds dirty_thresh, give the other good bdi's a pipe
+                * pages exceed dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
-                * more page. However bdi_dirty has accounting errors.  So use
-                * the larger and more IO friendly bdi_stat_error.
+                * more page. However wb_dirty has accounting errors.  So use
+                * the larger and more IO friendly wb_stat_error.
                 */
-               if (bdi_dirty <= bdi_stat_error(bdi))
+               if (sdtc->wb_dirty <= wb_stat_error(wb))
                        break;
 
                if (fatal_signal_pending(current))
                        break;
        }
 
-       if (!dirty_exceeded && bdi->dirty_exceeded)
-               bdi->dirty_exceeded = 0;
+       if (!dirty_exceeded && wb->dirty_exceeded)
+               wb->dirty_exceeded = 0;
 
-       if (writeback_in_progress(bdi))
+       if (writeback_in_progress(wb))
                return;
 
        /*
@@ -1540,8 +1772,8 @@ pause:
        if (laptop_mode)
                return;
 
-       if (nr_reclaimable > background_thresh)
-               bdi_start_background_writeback(bdi);
+       if (nr_reclaimable > gdtc->bg_thresh)
+               wb_start_background_writeback(wb);
 }
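
As a rough feel for how task_ratelimit turns into sleep time inside the loop above: dirtying pages_dirtied pages at an allowed task_ratelimit pages per second costs about HZ * pages_dirtied / task_ratelimit jiffies, clamped into the [min_pause, max_pause] window. The sketch below uses invented numbers and glosses over the pause bookkeeping in the elided part of the loop.

    #include <stdio.h>

    #define HZ 1000

    static long pick_pause(unsigned long pages_dirtied,
                           unsigned long task_ratelimit,
                           long min_pause, long max_pause)
    {
            long pause;

            if (!task_ratelimit)
                    return max_pause;       /* fully throttled, sleep the maximum */

            pause = HZ * pages_dirtied / task_ratelimit;
            if (pause < min_pause)
                    pause = min_pause;
            if (pause > max_pause)
                    pause = max_pause;
            return pause;
    }

    int main(void)
    {
            /* 64 pages dirtied at an allowed 8000 pages/s: 8ms, inside [4ms, 200ms] */
            printf("%ld\n", pick_pause(64, 8000, 4, 200));
            return 0;
    }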
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
@@ -1577,15 +1809,22 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
  */
 void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct inode *inode = mapping->host;
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct bdi_writeback *wb = NULL;
        int ratelimit;
        int *p;
 
        if (!bdi_cap_account_dirty(bdi))
                return;
 
+       if (inode_cgwb_enabled(inode))
+               wb = wb_get_create_current(bdi, GFP_KERNEL);
+       if (!wb)
+               wb = &bdi->wb;
+
        ratelimit = current->nr_dirtied_pause;
-       if (bdi->dirty_exceeded)
+       if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
        preempt_disable();
@@ -1617,10 +1856,59 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
        preempt_enable();
 
        if (unlikely(current->nr_dirtied >= ratelimit))
-               balance_dirty_pages(mapping, current->nr_dirtied);
+               balance_dirty_pages(mapping, wb, current->nr_dirtied);
+
+       wb_put(wb);
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
+/**
+ * wb_over_bg_thresh - does @wb need to be written back?
+ * @wb: bdi_writeback of interest
+ *
+ * Determines whether background writeback should keep writing @wb or it's
+ * clean enough.  Returns %true if writeback should continue.
+ */
+bool wb_over_bg_thresh(struct bdi_writeback *wb)
+{
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+
+       /*
+        * Similar to balance_dirty_pages() but ignores pages being written
+        * as we're trying to decide whether to put more under writeback.
+        */
+       gdtc->avail = global_dirtyable_memory();
+       gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+                     global_page_state(NR_UNSTABLE_NFS);
+       domain_dirty_limits(gdtc);
+
+       if (gdtc->dirty > gdtc->bg_thresh)
+               return true;
+
+       if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+               return true;
+
+       if (mdtc) {
+               unsigned long writeback;
+
+               mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+               mdtc_cap_avail(mdtc);
+               domain_dirty_limits(mdtc);      /* ditto, ignore writeback */
+
+               if (mdtc->dirty > mdtc->bg_thresh)
+                       return true;
+
+               if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+                       return true;
+       }
+
+       return false;
+}
+
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
        unsigned long background_thresh;
@@ -1628,7 +1916,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
 
         for ( ; ; ) {
                global_dirty_limits(&background_thresh, &dirty_thresh);
-               dirty_thresh = hard_dirty_limit(dirty_thresh);
+               dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
 
                 /*
                  * Boost the allowable dirty threshold a bit for page
@@ -1667,14 +1955,20 @@ void laptop_mode_timer_fn(unsigned long data)
        struct request_queue *q = (struct request_queue *)data;
        int nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
+       struct bdi_writeback *wb;
+       struct wb_iter iter;
 
        /*
         * We want to write everything out, not just down to the dirty
         * threshold
         */
-       if (bdi_has_dirty_io(&q->backing_dev_info))
-               bdi_start_writeback(&q->backing_dev_info, nr_pages,
-                                       WB_REASON_LAPTOP_TIMER);
+       if (!bdi_has_dirty_io(&q->backing_dev_info))
+               return;
+
+       bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+               if (wb_has_dirty_io(wb))
+                       wb_start_writeback(wb, nr_pages, true,
+                                          WB_REASON_LAPTOP_TIMER);
 }
 
 /*
@@ -1718,10 +2012,12 @@ void laptop_sync_completion(void)
 
 void writeback_set_ratelimit(void)
 {
+       struct wb_domain *dom = &global_wb_domain;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
+
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       global_dirty_limit = dirty_thresh;
+       dom->dirty_limit = dirty_thresh;
        ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
@@ -1770,7 +2066,7 @@ void __init page_writeback_init(void)
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
 
-       fprop_global_init(&writeout_completions, GFP_KERNEL);
+       BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
 }
 
 /**
@@ -2090,19 +2386,29 @@ int __set_page_dirty_no_writeback(struct page *page)
 
 /*
  * Helper function for set_page_dirty family.
+ *
+ * Caller must hold mem_cgroup_begin_page_stat().
+ *
  * NOTE: This relies on being atomic wrt interrupts.
  */
-void account_page_dirtied(struct page *page, struct address_space *mapping)
+void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg)
 {
+       struct inode *inode = mapping->host;
+
        trace_writeback_dirty_page(page, mapping);
 
        if (mapping_cap_account_dirty(mapping)) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct bdi_writeback *wb;
 
+               inode_attach_wb(inode, page);
+               wb = inode_to_wb(inode);
+
+               mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
-               __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
-               __inc_bdi_stat(bdi, BDI_DIRTIED);
+               __inc_wb_stat(wb, WB_RECLAIMABLE);
+               __inc_wb_stat(wb, WB_DIRTIED);
                task_io_account_write(PAGE_CACHE_SIZE);
                current->nr_dirtied++;
                this_cpu_inc(bdp_ratelimits);
@@ -2113,21 +2419,18 @@ EXPORT_SYMBOL(account_page_dirtied);
 /*
  * Helper function for deaccounting dirty page without writeback.
  *
- * Doing this should *normally* only ever be done when a page
- * is truncated, and is not actually mapped anywhere at all. However,
- * fs/buffer.c does this when it notices that somebody has cleaned
- * out all the buffers on a page without actually doing it through
- * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+ * Caller must hold mem_cgroup_begin_page_stat().
  */
-void account_page_cleaned(struct page *page, struct address_space *mapping)
+void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb)
 {
        if (mapping_cap_account_dirty(mapping)) {
+               mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                dec_zone_page_state(page, NR_FILE_DIRTY);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+               dec_wb_stat(wb, WB_RECLAIMABLE);
                task_io_account_cancelled_write(PAGE_CACHE_SIZE);
        }
 }
-EXPORT_SYMBOL(account_page_cleaned);
 
 /*
  * For address_spaces which do not use buffers.  Just tag the page as dirty in
@@ -2143,26 +2446,34 @@ EXPORT_SYMBOL(account_page_cleaned);
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_begin_page_stat(page);
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
                unsigned long flags;
 
-               if (!mapping)
+               if (!mapping) {
+                       mem_cgroup_end_page_stat(memcg);
                        return 1;
+               }
 
                spin_lock_irqsave(&mapping->tree_lock, flags);
                BUG_ON(page_mapping(page) != mapping);
                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
+               account_page_dirtied(page, mapping, memcg);
                radix_tree_tag_set(&mapping->page_tree, page_index(page),
                                   PAGECACHE_TAG_DIRTY);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
+
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
                }
                return 1;
        }
+       mem_cgroup_end_page_stat(memcg);
        return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
@@ -2177,10 +2488,17 @@ EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 void account_page_redirty(struct page *page)
 {
        struct address_space *mapping = page->mapping;
+
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               bool locked;
+
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                current->nr_dirtied--;
                dec_zone_page_state(page, NR_DIRTIED);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+               dec_wb_stat(wb, WB_DIRTIED);
+               unlocked_inode_to_wb_end(inode, locked);
        }
 }
 EXPORT_SYMBOL(account_page_redirty);
@@ -2265,6 +2583,43 @@ int set_page_dirty_lock(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty_lock);
 
+/*
+ * This cancels just the dirty bit on the kernel page itself, it does NOT
+ * actually remove dirty bits on any mmap's that may be around. It also
+ * leaves the page tagged dirty, so any sync activity will still find it on
+ * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+ * look at the dirty bits in the VM.
+ *
+ * Doing this should *normally* only ever be done when a page is truncated,
+ * and is not actually mapped anywhere at all. However, fs/buffer.c does
+ * this when it notices that somebody has cleaned out all the buffers on a
+ * page without actually doing it through the VM. Can you say "ext3 is
+ * horribly ugly"? Thought you could.
+ */
+void cancel_dirty_page(struct page *page)
+{
+       struct address_space *mapping = page_mapping(page);
+
+       if (mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+               if (TestClearPageDirty(page))
+                       account_page_cleaned(page, mapping, memcg, wb);
+
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+       } else {
+               ClearPageDirty(page);
+       }
+}
+EXPORT_SYMBOL(cancel_dirty_page);
+
 /*
  * Clear a page's dirty flag, while caring for dirty memory accounting.
  * Returns true if the page was previously dirty.
@@ -2282,10 +2637,16 @@ EXPORT_SYMBOL(set_page_dirty_lock);
 int clear_page_dirty_for_io(struct page *page)
 {
        struct address_space *mapping = page_mapping(page);
+       int ret = 0;
 
        BUG_ON(!PageLocked(page));
 
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
                /*
                 * Yes, Virginia, this is indeed insane.
                 *
@@ -2321,13 +2682,17 @@ int clear_page_dirty_for_io(struct page *page)
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                if (TestClearPageDirty(page)) {
+                       mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                        dec_zone_page_state(page, NR_FILE_DIRTY);
-                       dec_bdi_stat(inode_to_bdi(mapping->host),
-                                       BDI_RECLAIMABLE);
-                       return 1;
+                       dec_wb_stat(wb, WB_RECLAIMABLE);
+                       ret = 1;
                }
-               return 0;
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+               return ret;
        }
        return TestClearPageDirty(page);
 }
@@ -2341,7 +2706,8 @@ int test_clear_page_writeback(struct page *page)
 
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2351,8 +2717,10 @@ int test_clear_page_writeback(struct page *page)
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi)) {
-                               __dec_bdi_stat(bdi, BDI_WRITEBACK);
-                               __bdi_writeout_inc(bdi);
+                               struct bdi_writeback *wb = inode_to_wb(inode);
+
+                               __dec_wb_stat(wb, WB_WRITEBACK);
+                               __wb_writeout_inc(wb);
                        }
                }
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@ -2376,7 +2744,8 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
 
                spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2386,7 +2755,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi))
-                               __inc_bdi_stat(bdi, BDI_WRITEBACK);
+                               __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
                }
                if (!PageDirty(page))
                        radix_tree_tag_clear(&mapping->page_tree,
index 935675844b2ee45fb29447c7098be1d4869f2c65..60cd846a9a4401f73a6a51539507ecc22fb308d4 100644 (file)
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
        /*
         * Defer asynchronous read-ahead on IO congestion.
         */
-       if (bdi_read_congested(inode_to_bdi(mapping->host)))
+       if (inode_read_congested(mapping->host))
                return;
 
        /* do read-ahead */
index 7af1ecb21ccb2d560ca9f0f21e002a26465737f4..171b68768df1478355bcddd5e30c2edd616ba05b 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -30,6 +30,8 @@
  *             swap_lock (in swap_duplicate, swap_info_get)
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
+ *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ *                   mapping->tree_lock (widely used)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
  *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
index 66af9031fae8071c6d7a8c0b7e2a3da2c90684c7..76e35ad971025ce5eb3781543537d1bf3b947b8d 100644 (file)
@@ -116,9 +116,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
         * the VM has canceled the dirty bit (eg ext3 journaling).
         * Hence dirty accounting check is placed after invalidation.
         */
-       if (TestClearPageDirty(page))
-               account_page_cleaned(page, mapping);
-
+       cancel_dirty_page(page);
        ClearPageMappedToDisk(page);
        delete_from_page_cache(page);
        return 0;
@@ -512,19 +510,24 @@ EXPORT_SYMBOL(invalidate_mapping_pages);
 static int
 invalidate_complete_page2(struct address_space *mapping, struct page *page)
 {
+       struct mem_cgroup *memcg;
+       unsigned long flags;
+
        if (page->mapping != mapping)
                return 0;
 
        if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
                return 0;
 
-       spin_lock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
        if (PageDirty(page))
                goto failed;
 
        BUG_ON(page_has_private(page));
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
 
        if (mapping->a_ops->freepage)
                mapping->a_ops->freepage(page);
@@ -532,7 +535,8 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
        page_cache_release(page);       /* pagecache ref */
        return 1;
 failed:
-       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
        return 0;
 }
 
index 19ef01e90ac42077c3d7898d5ef0d149a166b1aa..e61445dce04e3cc83e9704e84f3d5bf9074b31db 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -154,11 +154,42 @@ static bool global_reclaim(struct scan_control *sc)
 {
        return !sc->target_mem_cgroup;
 }
+
+/**
+ * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * @sc: scan_control in question
+ *
+ * The normal page dirty throttling mechanism in balance_dirty_pages() is
+ * completely broken with the legacy memcg and direct stalling in
+ * shrink_page_list() is used for throttling instead, which lacks all the
+ * niceties such as fairness, adaptive pausing, bandwidth proportional
+ * allocation and configurability.
+ *
+ * This function tests whether the vmscan currently in progress can assume
+ * that the normal dirty throttling mechanism is operational.
+ */
+static bool sane_reclaim(struct scan_control *sc)
+{
+       struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+       if (!memcg)
+               return true;
+#ifdef CONFIG_CGROUP_WRITEBACK
+       if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+               return true;
+#endif
+       return false;
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
        return true;
 }
+
+static bool sane_reclaim(struct scan_control *sc)
+{
+       return true;
+}
 #endif
 
 static unsigned long zone_reclaimable_pages(struct zone *zone)
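
sane_reclaim() refines the old global_reclaim() test: global reclaim can always rely on balance_dirty_pages() for throttling, memcg reclaim on the unified (v2) hierarchy can too once CONFIG_CGROUP_WRITEBACK gives it per-memcg writeback domains, and only legacy-hierarchy memcg reclaim is left needing direct stalls. The later hunks in this file switch callers from global_reclaim() to sane_reclaim(); a condensed, illustrative fragment of that caller-side pattern (paraphrasing the shrink_inactive_list() change further below, not actual code from this series):

        if (sane_reclaim(sc)) {
                /* dirty throttling works: at most tag the zone congested and
                 * let wait_iff_congested() pace the reclaimer */
        } else {
                /* legacy memcg: no per-memcg dirty throttling, so reclaim
                 * falls back to stalling directly in shrink_page_list() */
        }
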
@@ -452,14 +483,13 @@ static inline int is_page_cache_freeable(struct page *page)
        return page_count(page) - page_has_private(page) == 2;
 }
 
-static int may_write_to_queue(struct backing_dev_info *bdi,
-                             struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
 {
        if (current->flags & PF_SWAPWRITE)
                return 1;
-       if (!bdi_write_congested(bdi))
+       if (!inode_write_congested(inode))
                return 1;
-       if (bdi == current->backing_dev_info)
+       if (inode_to_bdi(inode) == current->backing_dev_info)
                return 1;
        return 0;
 }
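
may_write_to_inode() keeps the old escape hatches (PF_SWAPWRITE writers and the reclaimer's own backing_dev_info) but swaps the queue-wide congestion test for the inode-granular one, mirroring the read side in mm/readahead.c above. As with inode_read_congested(), the write-side helper is presumably a wrapper over inode_congested(); a hedged sketch (the WB_sync_congested name is assumed, and the real definition in include/linux/backing-dev.h may differ):

        static inline int inode_write_congested(struct inode *inode)
        {
                return inode_congested(inode, 1 << WB_sync_congested);
        }
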
@@ -538,7 +568,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+       if (!may_write_to_inode(mapping->host, sc))
                return PAGE_KEEP;
 
        if (clear_page_dirty_for_io(page)) {
@@ -579,10 +609,14 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed)
 {
+       unsigned long flags;
+       struct mem_cgroup *memcg;
+
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       spin_lock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
        /*
         * The non racy check for a busy page.
         *
@@ -620,7 +654,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                swp_entry_t swap = { .val = page_private(page) };
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
@@ -640,8 +675,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping))
                        shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow);
-               spin_unlock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(page, shadow, memcg);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
 
                if (freepage != NULL)
                        freepage(page);
@@ -650,7 +686,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        return 1;
 
 cannot_free:
-       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
        return 0;
 }
 
@@ -917,7 +954,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 */
                mapping = page_mapping(page);
                if (((dirty || writeback) && mapping &&
-                    bdi_write_congested(inode_to_bdi(mapping->host))) ||
+                    inode_write_congested(mapping->host)) ||
                    (writeback && PageReclaim(page)))
                        nr_congested++;
 
@@ -935,10 +972,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 *    note that the LRU is being scanned too quickly and the
                 *    caller can stall after page list has been processed.
                 *
-                * 2) Global reclaim encounters a page, memcg encounters a
-                *    page that is not marked for immediate reclaim or
-                *    the caller does not have __GFP_IO. In this case mark
-                *    the page for immediate reclaim and continue scanning.
+                * 2) Global or new memcg reclaim encounters a page that is
+                *    not marked for immediate reclaim or the caller does not
+                *    have __GFP_IO. In this case mark the page for immediate
+                *    reclaim and continue scanning.
                 *
                 *    __GFP_IO is checked  because a loop driver thread might
                 *    enter reclaim, and deadlock if it waits on a page for
@@ -952,7 +989,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
                 *    may_enter_fs here is liable to OOM on them.
                 *
-                * 3) memcg encounters a page that is not already marked
+                * 3) Legacy memcg encounters a page that is not already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
                 *    pages are in writeback and there is nothing else to
@@ -967,7 +1004,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
 
                        /* Case 2 above */
-                       } else if (global_reclaim(sc) ||
+                       } else if (sane_reclaim(sc) ||
                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                /*
                                 * This is slightly racy - end_page_writeback()
@@ -1416,7 +1453,7 @@ static int too_many_isolated(struct zone *zone, int file,
        if (current_is_kswapd())
                return 0;
 
-       if (!global_reclaim(sc))
+       if (!sane_reclaim(sc))
                return 0;
 
        if (file) {
@@ -1608,10 +1645,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
                set_bit(ZONE_WRITEBACK, &zone->flags);
 
        /*
-        * memcg will stall in page writeback so only consider forcibly
-        * stalling for global reclaim
+        * Legacy memcg will stall in page writeback so avoid forcibly
+        * stalling here.
         */
-       if (global_reclaim(sc)) {
+       if (sane_reclaim(sc)) {
                /*
                 * Tag a zone as congested if all the dirty pages scanned were
                 * backed by a congested BDI and wait_iff_congested will stall.