Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.  This is one
  of those problems that have been talked about for at least half a
  decade; finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"
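
Enabling this from the filesystem side comes down to a superblock flag plus a
bdi capability: the series ends up with the per-superblock SB_I_CGROUPWB flag
(see the "vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB"
commit in the list below), and blk_alloc_queue_node() now tags the queue's
backing_dev_info with BDI_CAP_CGROUP_WRITEBACK (visible in the block/blk-core.c
hunk further down).  A minimal sketch, assuming only those flag names from the
series; the helper itself is hypothetical:

	#include <linux/fs.h>

	/*
	 * Hypothetical helper, not part of this diff: setting SB_I_CGROUPWB
	 * tells the writeback core that this filesystem's dirty inodes may be
	 * attached to per-cgroup bdi_writeback's instead of always to bdi->wb.
	 */
	static void example_enable_cgroup_writeback(struct super_block *sb)
	{
		sb->s_iflags |= SB_I_CGROUPWB;
	}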

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...

32 files changed:
block/blk-cgroup.c
block/blk-core.c
block/blk-sysfs.c
block/bounce.c
block/cfq-iosched.c
block/elevator.c
block/genhd.c
drivers/md/dm.c
drivers/md/raid10.c
fs/ext4/extents.c
fs/ext4/mballoc.c
fs/ext4/super.c
fs/f2fs/node.c
fs/f2fs/segment.h
fs/inode.c
fs/nfs/write.c
fs/ocfs2/file.c
fs/xfs/xfs_file.c
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blkdev.h
include/linux/fs.h
include/linux/memcontrol.h
include/linux/mm.h
include/trace/events/writeback.h
init/Kconfig
mm/backing-dev.c
mm/filemap.c
mm/memcontrol.c
mm/page-writeback.c
mm/rmap.c
mm/vmscan.c

diff --combined block/blk-cgroup.c
index 6e43fa355e7127e8e2b10ff33eee5c0ab43ccf90,31610ae0ebff2bcbd6b9d80da9f04e08bcc1b697..9f97da52d006281b1ab3e2911d85934216e3931a
@@@ -9,30 -9,30 +9,33 @@@
   *
   * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
   *                  Nauman Rafique <nauman@google.com>
 + *
 + * For policy-specific per-blkcg data:
 + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 + *                    Arianna Avanzini <avanzini.arianna@gmail.com>
   */
  #include <linux/ioprio.h>
  #include <linux/kdev_t.h>
  #include <linux/module.h>
  #include <linux/err.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/slab.h>
  #include <linux/genhd.h>
  #include <linux/delay.h>
  #include <linux/atomic.h>
- #include "blk-cgroup.h"
+ #include <linux/blk-cgroup.h>
  #include "blk.h"
  
  #define MAX_KEY_LEN 100
  
  static DEFINE_MUTEX(blkcg_pol_mutex);
  
 -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 -                          .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 +struct blkcg blkcg_root;
  EXPORT_SYMBOL_GPL(blkcg_root);
  
+ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
  static bool blkcg_policy_enabled(struct request_queue *q,
@@@ -182,6 -182,7 +185,7 @@@ static struct blkcg_gq *blkg_create(str
                                    struct blkcg_gq *new_blkg)
  {
        struct blkcg_gq *blkg;
+       struct bdi_writeback_congested *wb_congested;
        int i, ret;
  
        WARN_ON_ONCE(!rcu_read_lock_held());
                goto err_free_blkg;
        }
  
+       wb_congested = wb_congested_get_create(&q->backing_dev_info,
+                                              blkcg->css.id, GFP_ATOMIC);
+       if (!wb_congested) {
+               ret = -ENOMEM;
+               goto err_put_css;
+       }
        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
        }
        blkg = new_blkg;
+       blkg->wb_congested = wb_congested;
  
        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -EINVAL;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }
        blkg->online = true;
        spin_unlock(&blkcg->lock);
  
-       if (!ret) {
-               if (blkcg == &blkcg_root) {
-                       q->root_blkg = blkg;
-                       q->root_rl.blkg = blkg;
-               }
+       if (!ret)
                return blkg;
-       }
  
        /* @blkg failed fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);
  
+ err_put_congested:
+       wb_congested_put(wb_congested);
  err_put_css:
        css_put(&blkcg->css);
  err_free_blkg:
@@@ -342,15 -348,6 +351,6 @@@ static void blkg_destroy(struct blkcg_g
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);
  
-       /*
-        * If root blkg is destroyed.  Just clear the pointer since root_rl
-        * does not take reference on root blkg.
-        */
-       if (blkcg == &blkcg_root) {
-               blkg->q->root_blkg = NULL;
-               blkg->q->root_rl.blkg = NULL;
-       }
        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
@@@ -405,6 -402,8 +405,8 @@@ void __blkg_release_rcu(struct rcu_hea
        if (blkg->parent)
                blkg_put(blkg->parent);
  
+       wb_congested_put(blkg->wb_congested);
        blkg_free(blkg);
  }
  EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@@ -812,6 -811,8 +814,8 @@@ static void blkcg_css_offline(struct cg
        }
  
        spin_unlock_irq(&blkcg->lock);
+       wb_blkcg_offline(blkcg);
  }
  
  static void blkcg_css_free(struct cgroup_subsys_state *css)
@@@ -826,8 -827,6 +830,8 @@@ static struct cgroup_subsys_state 
  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
  {
        struct blkcg *blkcg;
 +      struct cgroup_subsys_state *ret;
 +      int i;
  
        if (!parent_css) {
                blkcg = &blkcg_root;
        }
  
        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 -      if (!blkcg)
 -              return ERR_PTR(-ENOMEM);
 +      if (!blkcg) {
 +              ret = ERR_PTR(-ENOMEM);
 +              goto free_blkcg;
 +      }
 +
 +      for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 +              struct blkcg_policy *pol = blkcg_policy[i];
 +              struct blkcg_policy_data *cpd;
 +
 +              /*
 +               * If the policy hasn't been attached yet, wait for it
 +               * to be attached before doing anything else. Otherwise,
 +               * check if the policy requires any specific per-cgroup
 +               * data: if it does, allocate and initialize it.
 +               */
 +              if (!pol || !pol->cpd_size)
 +                      continue;
 +
 +              BUG_ON(blkcg->pd[i]);
 +              cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 +              if (!cpd) {
 +                      ret = ERR_PTR(-ENOMEM);
 +                      goto free_pd_blkcg;
 +              }
 +              blkcg->pd[i] = cpd;
 +              cpd->plid = i;
 +              pol->cpd_init_fn(blkcg);
 +      }
  
 -      blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
 -      blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
  done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+ #endif
        return &blkcg->css;
 +
 +free_pd_blkcg:
 +      for (i--; i >= 0; i--)
 +              kfree(blkcg->pd[i]);
 +
 +free_blkcg:
 +      kfree(blkcg);
 +      return ret;
  }
  
  /**
   */
  int blkcg_init_queue(struct request_queue *q)
  {
-       might_sleep();
+       struct blkcg_gq *new_blkg, *blkg;
+       bool preloaded;
+       int ret;
+       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+       if (!new_blkg)
+               return -ENOMEM;
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+       /*
+        * Make sure the root blkg exists and count the existing blkgs.  As
+        * @q is bypassing at this point, blkg_lookup_create() can't be
+        * used.  Open code insertion.
+        */
+       rcu_read_lock();
+       spin_lock_irq(q->queue_lock);
+       blkg = blkg_create(&blkcg_root, q, new_blkg);
+       spin_unlock_irq(q->queue_lock);
+       rcu_read_unlock();
+       if (preloaded)
+               radix_tree_preload_end();
+       if (IS_ERR(blkg)) {
+               kfree(new_blkg);
+               return PTR_ERR(blkg);
+       }
+       q->root_blkg = blkg;
+       q->root_rl.blkg = blkg;
  
-       return blk_throtl_init(q);
+       ret = blk_throtl_init(q);
+       if (ret) {
+               spin_lock_irq(q->queue_lock);
+               blkg_destroy_all(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+       return ret;
  }
  
  /**
@@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques
                          const struct blkcg_policy *pol)
  {
        LIST_HEAD(pds);
-       struct blkcg_gq *blkg, *new_blkg;
 +      LIST_HEAD(cpds);
 -      struct blkg_policy_data *pd, *n;
+       struct blkcg_gq *blkg;
 +      struct blkg_policy_data *pd, *nd;
 +      struct blkcg_policy_data *cpd, *cnd;
        int cnt = 0, ret;
-       bool preloaded;
  
        if (blkcg_policy_enabled(q, pol))
                return 0;
  
-       /* preallocations for root blkg */
-       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-       if (!new_blkg)
-               return -ENOMEM;
+       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-       /*
-        * Make sure the root blkg exists and count the existing blkgs.  As
-        * @q is bypassing at this point, blkg_lookup_create() can't be
-        * used.  Open code it.
-        */
        spin_lock_irq(q->queue_lock);
-       rcu_read_lock();
-       blkg = __blkg_lookup(&blkcg_root, q, false);
-       if (blkg)
-               blkg_free(new_blkg);
-       else
-               blkg = blkg_create(&blkcg_root, q, new_blkg);
-       rcu_read_unlock();
-       if (preloaded)
-               radix_tree_preload_end();
-       if (IS_ERR(blkg)) {
-               ret = PTR_ERR(blkg);
-               goto out_unlock;
-       }
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
        spin_unlock_irq(q->queue_lock);
  
 +      /*
 +       * Allocate per-blkg and per-blkcg policy data
 +       * for all existing blkgs.
 +       */
        while (cnt--) {
                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
                if (!pd) {
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
 +
 +              if (!pol->cpd_size)
 +                      continue;
 +              cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
 +              if (!cpd) {
 +                      ret = -ENOMEM;
 +                      goto out_free;
 +              }
 +              list_add_tail(&cpd->alloc_node, &cpds);
        }
  
        /*
 -       * Install the allocated pds With @q bypassing, no new blkg
 +       * Install the allocated pds and cpds. With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);
  
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
 -              if (WARN_ON(list_empty(&pds))) {
 +              if (WARN_ON(list_empty(&pds)) ||
 +                  WARN_ON(pol->cpd_size && list_empty(&cpds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
 +              cpd = list_first_entry(&cpds, struct blkcg_policy_data,
 +                                     alloc_node);
 +              list_del_init(&cpd->alloc_node);
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);
  
                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);
  
 +              if (!pol->cpd_size)
 +                      goto no_cpd;
 +              if (!blkg->blkcg->pd[pol->plid]) {
 +                      /* Per-policy per-blkcg data */
 +                      blkg->blkcg->pd[pol->plid] = cpd;
 +                      cpd->plid = pol->plid;
 +                      pol->cpd_init_fn(blkg->blkcg);
 +              } else { /* must free it as it has already been extracted */
 +                      kfree(cpd);
 +              }
 +no_cpd:
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
@@@ -1111,10 -1055,8 +1122,10 @@@ out_unlock
        spin_unlock_irq(q->queue_lock);
  out_free:
        blk_queue_bypass_end(q);
 -      list_for_each_entry_safe(pd, n, &pds, alloc_node)
 +      list_for_each_entry_safe(pd, nd, &pds, alloc_node)
                kfree(pd);
 +      list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
 +              kfree(cpd);
        return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@@ -1140,10 -1082,6 +1151,6 @@@ void blkcg_deactivate_policy(struct req
  
        __clear_bit(pol->plid, q->blkcg_pols);
  
-       /* if no policy is left, no need for blkgs - shoot them down */
-       if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
-               blkg_destroy_all(q);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
  
                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;
 +              kfree(blkg->blkcg->pd[pol->plid]);
 +              blkg->blkcg->pd[pol->plid] = NULL;
  
                spin_unlock(&blkg->blkcg->lock);
        }
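
The blkcg core above now also carries policy-specific per-blkcg data (on top of
the existing per-blkg policy data): blkcg_css_alloc() allocates pol->cpd_size
bytes for each registered policy that declares a cpd_size and calls its
cpd_init_fn, and blkcg_activate_policy()/blkcg_deactivate_policy() allocate and
free the same data for policies enabled later.  That is what lets CFQ move its
weights out of struct blkcg and into its own cfq_group_data (see the
block/cfq-iosched.c diff below).  A condensed sketch of the pattern with a
hypothetical policy; the hooks and fields are the ones used in this diff:

	#include <linux/blk-cgroup.h>

	/* Hypothetical policy-private per-blkcg data; mirrors cfq_group_data. */
	struct example_group_data {
		struct blkcg_policy_data pd;	/* must be the first member */
		unsigned int weight;
	};

	static struct blkcg_policy blkcg_policy_example;

	static void example_cpd_init(const struct blkcg *blkcg)
	{
		struct example_group_data *egd =
			container_of(blkcg->pd[blkcg_policy_example.plid],
				     struct example_group_data, pd);

		egd->weight = 500;	/* some per-cgroup default */
	}

	static struct blkcg_policy blkcg_policy_example = {
		/* .pd_size, .cftypes, .pd_init_fn etc. omitted for brevity */
		.cpd_size	= sizeof(struct example_group_data),
		.cpd_init_fn	= example_cpd_init,
	};
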
diff --combined block/blk-core.c
index f6ab750060fe019f97d0ccfbca367b9e6cd3b426,a4a2dbe46fe30df73de26d8b7f11f485220a3ce7..688ae9482cb8eab438d3bbcaf6d61602a366cfc8
  #include <linux/delay.h>
  #include <linux/ratelimit.h>
  #include <linux/pm_runtime.h>
+ #include <linux/blk-cgroup.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/block.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  #include "blk-mq.h"
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@@ -63,6 -63,31 +63,31 @@@ struct kmem_cache *blk_requestq_cachep
   */
  static struct workqueue_struct *kblockd_workqueue;
  
+ static void blk_clear_congested(struct request_list *rl, int sync)
+ {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       clear_wb_congested(rl->blkg->wb_congested, sync);
+ #else
+       /*
+        * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+        * flip its congestion state for events on other blkcgs.
+        */
+       if (rl == &rl->q->root_rl)
+               clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+ #endif
+ }
+ static void blk_set_congested(struct request_list *rl, int sync)
+ {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       set_wb_congested(rl->blkg->wb_congested, sync);
+ #else
+       /* see blk_clear_congested() */
+       if (rl == &rl->q->root_rl)
+               set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+ #endif
+ }
  void blk_queue_congestion_threshold(struct request_queue *q)
  {
        int nr;
@@@ -554,8 -579,6 +579,8 @@@ void blk_cleanup_queue(struct request_q
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
  
 +      bdi_destroy(&q->backing_dev_info);
 +
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
  }
@@@ -623,8 -646,7 +648,7 @@@ struct request_queue *blk_alloc_queue_n
  
        q->backing_dev_info.ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-       q->backing_dev_info.state = 0;
-       q->backing_dev_info.capabilities = 0;
+       q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
        q->backing_dev_info.name = "block";
        q->node = node_id;
  
@@@ -736,8 -758,6 +760,8 @@@ blk_init_queue_node(request_fn_proc *rf
  }
  EXPORT_SYMBOL(blk_init_queue_node);
  
 +static void blk_queue_bio(struct request_queue *q, struct bio *bio);
 +
  struct request_queue *
  blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
                         spinlock_t *lock)
@@@ -847,13 -867,8 +871,8 @@@ static void __freed_request(struct requ
  {
        struct request_queue *q = rl->q;
  
-       /*
-        * bdi isn't aware of blkcg yet.  As all async IOs end up root
-        * blkcg anyway, just use root blkcg state.
-        */
-       if (rl == &q->root_rl &&
-           rl->count[sync] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, sync);
+       if (rl->count[sync] < queue_congestion_off_threshold(q))
+               blk_clear_congested(rl, sync);
  
        if (rl->count[sync] + 1 <= q->nr_requests) {
                if (waitqueue_active(&rl->wait[sync]))
@@@ -886,25 -901,25 +905,25 @@@ static void freed_request(struct reques
  int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  {
        struct request_list *rl;
+       int on_thresh, off_thresh;
  
        spin_lock_irq(q->queue_lock);
        q->nr_requests = nr;
        blk_queue_congestion_threshold(q);
+       on_thresh = queue_congestion_on_threshold(q);
+       off_thresh = queue_congestion_off_threshold(q);
  
-       /* congestion isn't cgroup aware and follows root blkcg for now */
-       rl = &q->root_rl;
-       if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_SYNC);
-       else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_SYNC);
+       blk_queue_for_each_rl(rl, q) {
+               if (rl->count[BLK_RW_SYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_SYNC);
+               else if (rl->count[BLK_RW_SYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_SYNC);
  
-       if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_ASYNC);
-       else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_ASYNC);
+               if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_ASYNC);
+               else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_ASYNC);
  
-       blk_queue_for_each_rl(rl, q) {
                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
                        blk_set_rl_full(rl, BLK_RW_SYNC);
                } else {
@@@ -1014,12 -1029,7 +1033,7 @@@ static struct request *__get_request(st
                                }
                        }
                }
-               /*
-                * bdi isn't aware of blkcg yet.  As all async IOs end up
-                * root blkcg anyway, just use root blkcg state.
-                */
-               if (rl == &q->root_rl)
-                       blk_set_queue_congested(q, is_sync);
+               blk_set_congested(rl, is_sync);
        }
  
        /*
@@@ -1591,7 -1601,7 +1605,7 @@@ void init_request_from_bio(struct reque
        blk_rq_bio_prep(req->q, req, bio);
  }
  
 -void blk_queue_bio(struct request_queue *q, struct bio *bio)
 +static void blk_queue_bio(struct request_queue *q, struct bio *bio)
  {
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        struct blk_plug *plug;
@@@ -1699,6 -1709,7 +1713,6 @@@ out_unlock
                spin_unlock_irq(q->queue_lock);
        }
  }
 -EXPORT_SYMBOL_GPL(blk_queue_bio);     /* for device mapper only */
  
  /*
   * If bio->bi_dev is a partition, remap the location
diff --combined block/blk-sysfs.c
index 2b8fd302f677a967d87994f8a7532aab8dfe6569,1b60941dc4c65c45e05450e3a162b341b33aeec3..6264b382d4d1ba8765dc3b22cead4fd9bf384d99
@@@ -6,11 -6,12 +6,12 @@@
  #include <linux/module.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/blktrace_api.h>
  #include <linux/blk-mq.h>
+ #include <linux/blk-cgroup.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  #include "blk-mq.h"
  
  struct queue_sysfs_entry {
@@@ -522,6 -523,8 +523,6 @@@ static void blk_release_queue(struct ko
  
        blk_trace_shutdown(q);
  
 -      bdi_destroy(&q->backing_dev_info);
 -
        ida_simple_remove(&blk_queue_ida, q->id);
        call_rcu(&q->rcu_head, blk_free_queue_rcu);
  }
diff --combined block/bounce.c
index 3ab0bce1c947ef9be81f09139aa73d9bd4b76ff5,072280b3dd138e7cecf555c914fbb99c0589d618..b17311227c12764f18760ee4ce71fa828f939f45
@@@ -13,6 -13,7 +13,7 @@@
  #include <linux/pagemap.h>
  #include <linux/mempool.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/init.h>
  #include <linux/hash.h>
  #include <linux/highmem.h>
@@@ -218,8 -219,8 +219,8 @@@ bounce
                if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
                        continue;
  
 -              inc_zone_page_state(to->bv_page, NR_BOUNCE);
                to->bv_page = mempool_alloc(pool, q->bounce_gfp);
 +              inc_zone_page_state(to->bv_page, NR_BOUNCE);
  
                if (rw == WRITE) {
                        char *vto, *vfrom;
diff --combined block/cfq-iosched.c
index d8ad45ccd8fa784a60dac66d91158eb0c9065b27,bc8f429307736988d036b3d50a95ca964cc6650a..c62bb2e650b8c741e64ead5c9f32b090cbf19730
@@@ -14,8 -14,8 +14,8 @@@
  #include <linux/rbtree.h>
  #include <linux/ioprio.h>
  #include <linux/blktrace_api.h>
+ #include <linux/blk-cgroup.h>
  #include "blk.h"
- #include "blk-cgroup.h"
  
  /*
   * tunables
@@@ -67,11 -67,6 +67,11 @@@ static struct kmem_cache *cfq_pool
  #define sample_valid(samples) ((samples) > 80)
  #define rb_entry_cfqg(node)   rb_entry((node), struct cfq_group, rb_node)
  
 +/* blkio-related constants */
 +#define CFQ_WEIGHT_MIN          10
 +#define CFQ_WEIGHT_MAX          1000
 +#define CFQ_WEIGHT_DEFAULT      500
 +
  struct cfq_ttime {
        unsigned long last_end_request;
  
@@@ -217,15 -212,6 +217,15 @@@ struct cfqg_stats 
  #endif        /* CONFIG_CFQ_GROUP_IOSCHED */
  };
  
 +/* Per-cgroup data */
 +struct cfq_group_data {
 +      /* must be the first member */
 +      struct blkcg_policy_data pd;
 +
 +      unsigned int weight;
 +      unsigned int leaf_weight;
 +};
 +
  /* This is per cgroup per device grouping structure */
  struct cfq_group {
        /* must be the first member */
@@@ -460,6 -446,16 +460,6 @@@ CFQ_CFQQ_FNS(deep)
  CFQ_CFQQ_FNS(wait_busy);
  #undef CFQ_CFQQ_FNS
  
 -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 -{
 -      return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 -}
 -
 -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 -{
 -      return pd_to_blkg(&cfqg->pd);
 -}
 -
  #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
  
  /* cfqg stats flags */
@@@ -604,22 -600,6 +604,22 @@@ static inline void cfqg_stats_update_av
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
  
 +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 +{
 +      return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 +}
 +
 +static struct cfq_group_data
 +*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 +{
 +      return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
 +}
 +
 +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 +{
 +      return pd_to_blkg(&cfqg->pd);
 +}
 +
  static struct blkcg_policy blkcg_policy_cfq;
  
  static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
        return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
  }
  
 +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 +{
 +      return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 +}
 +
  static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
  {
        struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@@ -1569,28 -1544,13 +1569,28 @@@ static void cfqg_stats_init(struct cfqg
  #endif
  }
  
 +static void cfq_cpd_init(const struct blkcg *blkcg)
 +{
 +      struct cfq_group_data *cgd =
 +              cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
 +
 +      if (blkcg == &blkcg_root) {
 +              cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
 +              cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 +      } else {
 +              cgd->weight = CFQ_WEIGHT_DEFAULT;
 +              cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
 +      }
 +}
 +
  static void cfq_pd_init(struct blkcg_gq *blkg)
  {
        struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
  
        cfq_init_cfqg_base(cfqg);
 -      cfqg->weight = blkg->blkcg->cfq_weight;
 -      cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
 +      cfqg->weight = cgd->weight;
 +      cfqg->leaf_weight = cgd->leaf_weight;
        cfqg_stats_init(&cfqg->stats);
        cfqg_stats_init(&cfqg->dead_stats);
  }
@@@ -1713,27 -1673,13 +1713,27 @@@ static int cfqg_print_leaf_weight_devic
  
  static int cfq_print_weight(struct seq_file *sf, void *v)
  {
 -      seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
 +      struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
 +      unsigned int val = 0;
 +
 +      if (cgd)
 +              val = cgd->weight;
 +
 +      seq_printf(sf, "%u\n", val);
        return 0;
  }
  
  static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
  {
 -      seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
 +      struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
 +      unsigned int val = 0;
 +
 +      if (cgd)
 +              val = cgd->leaf_weight;
 +
 +      seq_printf(sf, "%u\n", val);
        return 0;
  }
  
@@@ -1744,7 -1690,6 +1744,7 @@@ static ssize_t __cfqg_set_weight_device
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        struct cfq_group *cfqg;
 +      struct cfq_group_data *cfqgd;
        int ret;
  
        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
  
        ret = -EINVAL;
        cfqg = blkg_to_cfqg(ctx.blkg);
 +      cfqgd = blkcg_to_cfqgd(blkcg);
 +      if (!cfqg || !cfqgd)
 +              goto err;
 +
        if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
                if (!is_leaf_weight) {
                        cfqg->dev_weight = ctx.v;
 -                      cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
 +                      cfqg->new_weight = ctx.v ?: cfqgd->weight;
                } else {
                        cfqg->dev_leaf_weight = ctx.v;
 -                      cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
 +                      cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
                }
                ret = 0;
        }
  
 +err:
        blkg_conf_finish(&ctx);
        return ret ?: nbytes;
  }
@@@ -1790,23 -1730,16 +1790,23 @@@ static int __cfq_set_weight(struct cgro
  {
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
 +      struct cfq_group_data *cfqgd;
 +      int ret = 0;
  
        if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
                return -EINVAL;
  
        spin_lock_irq(&blkcg->lock);
 +      cfqgd = blkcg_to_cfqgd(blkcg);
 +      if (!cfqgd) {
 +              ret = -EINVAL;
 +              goto out;
 +      }
  
        if (!is_leaf_weight)
 -              blkcg->cfq_weight = val;
 +              cfqgd->weight = val;
        else
 -              blkcg->cfq_leaf_weight = val;
 +              cfqgd->leaf_weight = val;
  
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                struct cfq_group *cfqg = blkg_to_cfqg(blkg);
  
                if (!is_leaf_weight) {
                        if (!cfqg->dev_weight)
 -                              cfqg->new_weight = blkcg->cfq_weight;
 +                              cfqg->new_weight = cfqgd->weight;
                } else {
                        if (!cfqg->dev_leaf_weight)
 -                              cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
 +                              cfqg->new_leaf_weight = cfqgd->leaf_weight;
                }
        }
  
 +out:
        spin_unlock_irq(&blkcg->lock);
 -      return 0;
 +      return ret;
  }
  
  static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@@ -4545,18 -4477,6 +4545,18 @@@ out_free
        return ret;
  }
  
 +static void cfq_registered_queue(struct request_queue *q)
 +{
 +      struct elevator_queue *e = q->elevator;
 +      struct cfq_data *cfqd = e->elevator_data;
 +
 +      /*
 +       * Default to IOPS mode with no idling for SSDs
 +       */
 +      if (blk_queue_nonrot(q))
 +              cfqd->cfq_slice_idle = 0;
 +}
 +
  /*
   * sysfs parts below -->
   */
@@@ -4672,7 -4592,6 +4672,7 @@@ static struct elevator_type iosched_cf
                .elevator_may_queue_fn =        cfq_may_queue,
                .elevator_init_fn =             cfq_init_queue,
                .elevator_exit_fn =             cfq_exit_queue,
 +              .elevator_registered_fn =       cfq_registered_queue,
        },
        .icq_size       =       sizeof(struct cfq_io_cq),
        .icq_align      =       __alignof__(struct cfq_io_cq),
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
  static struct blkcg_policy blkcg_policy_cfq = {
        .pd_size                = sizeof(struct cfq_group),
 +      .cpd_size               = sizeof(struct cfq_group_data),
        .cftypes                = cfq_blkcg_files,
  
 +      .cpd_init_fn            = cfq_cpd_init,
        .pd_init_fn             = cfq_pd_init,
        .pd_offline_fn          = cfq_pd_offline,
        .pd_reset_stats_fn      = cfq_pd_reset_stats,
diff --combined block/elevator.c
index 942579d04128b5484f2d3e53bf38b4994ef852ee,3bbb48f430e40e4fc022d13e1f469e09377f662e..84d63943f2de2f386ff35e6a395f68ada173b5b2
  #include <linux/hash.h>
  #include <linux/uaccess.h>
  #include <linux/pm_runtime.h>
+ #include <linux/blk-cgroup.h>
  
  #include <trace/events/block.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  
  static DEFINE_SPINLOCK(elv_list_lock);
  static LIST_HEAD(elv_list);
@@@ -157,7 -157,7 +157,7 @@@ struct elevator_queue *elevator_alloc(s
  
        eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
        if (unlikely(!eq))
 -              goto err;
 +              return NULL;
  
        eq->type = e;
        kobject_init(&eq->kobj, &elv_ktype);
        hash_init(eq->hash);
  
        return eq;
 -err:
 -      kfree(eq);
 -      elevator_put(e);
 -      return NULL;
  }
  EXPORT_SYMBOL(elevator_alloc);
  
@@@ -806,8 -810,6 +806,8 @@@ int elv_register_queue(struct request_q
                }
                kobject_uevent(&e->kobj, KOBJ_ADD);
                e->registered = 1;
 +              if (e->type->ops.elevator_registered_fn)
 +                      e->type->ops.elevator_registered_fn(q);
        }
        return error;
  }
diff --combined block/genhd.c
index ea982eadaf6380b974d6b1d39a7197085217ac91,d46ba566d62faeffc2b95089c0e684699c9257e4..59a1395eedac45e3e5d6326ed2956caf8a7de7c8
@@@ -8,6 -8,7 +8,7 @@@
  #include <linux/kdev_t.h>
  #include <linux/kernel.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/init.h>
  #include <linux/spinlock.h>
  #include <linux/proc_fs.h>
@@@ -422,9 -423,9 +423,9 @@@ int blk_alloc_devt(struct hd_struct *pa
        /* allocate ext devt */
        idr_preload(GFP_KERNEL);
  
 -      spin_lock(&ext_devt_lock);
 +      spin_lock_bh(&ext_devt_lock);
        idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
 -      spin_unlock(&ext_devt_lock);
 +      spin_unlock_bh(&ext_devt_lock);
  
        idr_preload_end();
        if (idx < 0)
@@@ -449,9 -450,9 +450,9 @@@ void blk_free_devt(dev_t devt
                return;
  
        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 -              spin_lock(&ext_devt_lock);
 +              spin_lock_bh(&ext_devt_lock);
                idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 -              spin_unlock(&ext_devt_lock);
 +              spin_unlock_bh(&ext_devt_lock);
        }
  }
  
@@@ -653,6 -654,7 +654,6 @@@ void del_gendisk(struct gendisk *disk
        disk->flags &= ~GENHD_FL_UP;
  
        sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 -      bdi_unregister(&disk->queue->backing_dev_info);
        blk_unregister_queue(disk);
        blk_unregister_region(disk_devt(disk), disk->minors);
  
@@@ -690,13 -692,13 +691,13 @@@ struct gendisk *get_gendisk(dev_t devt
        } else {
                struct hd_struct *part;
  
 -              spin_lock(&ext_devt_lock);
 +              spin_lock_bh(&ext_devt_lock);
                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
                if (part && get_disk(part_to_disk(part))) {
                        *partno = part->partno;
                        disk = part_to_disk(part);
                }
 -              spin_unlock(&ext_devt_lock);
 +              spin_unlock_bh(&ext_devt_lock);
        }
  
        return disk;
diff --combined drivers/md/dm.c
index 4d6f089a0e9e2eca5b8fa58017a29e1da598c2a0,2161ed9329c41e95969415a23ec76af87a3476b1..d72829922eb6c8a2c81f2892f7c265a5bb0d9f24
@@@ -1031,11 -1031,13 +1031,11 @@@ static void rq_completed(struct mapped_
        dm_put(md);
  }
  
 -static void free_rq_clone(struct request *clone, bool must_be_mapped)
 +static void free_rq_clone(struct request *clone)
  {
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
  
 -      WARN_ON_ONCE(must_be_mapped && !clone->q);
 -
        if (md->type == DM_TYPE_MQ_REQUEST_BASED)
                /* stacked on blk-mq queue(s) */
                tio->ti->type->release_clone_rq(clone);
@@@ -1077,7 -1079,7 +1077,7 @@@ static void dm_end_request(struct reque
                        rq->sense_len = clone->sense_len;
        }
  
 -      free_rq_clone(clone, true);
 +      free_rq_clone(clone);
        if (!rq->q->mq_ops)
                blk_end_request_all(rq, error);
        else
@@@ -1096,7 -1098,7 +1096,7 @@@ static void dm_unprep_request(struct re
        }
  
        if (clone)
 -              free_rq_clone(clone, false);
 +              free_rq_clone(clone);
  }
  
  /*
@@@ -1109,7 -1111,6 +1109,7 @@@ static void old_requeue_request(struct 
  
        spin_lock_irqsave(q->queue_lock, flags);
        blk_requeue_request(q, rq);
 +      blk_run_queue_async(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
  }
  
@@@ -1670,7 -1671,8 +1670,7 @@@ static int dm_merge_bvec(struct request
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_live_table_fast(md);
        struct dm_target *ti;
 -      sector_t max_sectors;
 -      int max_size = 0;
 +      sector_t max_sectors, max_size = 0;
  
        if (unlikely(!map))
                goto out;
        max_sectors = min(max_io_len(bvm->bi_sector, ti),
                          (sector_t) queue_max_sectors(q));
        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
 -      if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
 -              max_size = 0;
 +
 +      /*
 +       * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
 +       * to the targets' merge function since it holds sectors not bytes).
 +       * Just doing this as an interim fix for stable@ because the more
 +       * comprehensive cleanup of switching to sector_t will impact every
 +       * DM target that implements a ->merge hook.
 +       */
 +      if (max_size > INT_MAX)
 +              max_size = INT_MAX;
  
        /*
         * merge_bvec_fn() returns number of bytes
         * max is precomputed maximal io size
         */
        if (max_size && ti->type->merge)
 -              max_size = ti->type->merge(ti, bvm, biovec, max_size);
 +              max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
        /*
         * If the target doesn't support merge method and some of the devices
         * provided their merge_bvec method (we know this by looking for the
@@@ -1893,8 -1887,8 +1893,8 @@@ static int map_request(struct dm_rq_tar
                        dm_kill_unmapped_request(rq, r);
                        return r;
                }
 -              if (IS_ERR(clone))
 -                      return DM_MAPIO_REQUEUE;
 +              if (r != DM_MAPIO_REMAPPED)
 +                      return r;
                setup_clone(clone, rq, tio);
        }
  
@@@ -2080,7 -2074,7 +2080,7 @@@ static int dm_any_congested(void *conge
                         * the query about congestion status of request_queue
                         */
                        if (dm_request_based(md))
-                               r = md->queue->backing_dev_info.state &
+                               r = md->queue->backing_dev_info.wb.state &
                                    bdi_bits;
                        else
                                r = dm_table_any_congested(map, bdi_bits);
@@@ -2669,15 -2663,13 +2669,15 @@@ static int dm_mq_queue_rq(struct blk_mq
        if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
                /* clone request is allocated at the end of the pdu */
                tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
 -              if (!clone_rq(rq, md, tio, GFP_ATOMIC))
 -                      return BLK_MQ_RQ_QUEUE_BUSY;
 +              (void) clone_rq(rq, md, tio, GFP_ATOMIC);
                queue_kthread_work(&md->kworker, &tio->work);
        } else {
                /* Direct call is fine since .queue_rq allows allocations */
 -              if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
 -                      dm_requeue_unmapped_original_request(md, rq);
 +              if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
 +                      /* Undo dm_start_request() before requeuing */
 +                      rq_completed(md, rq_data_dir(rq), false);
 +                      return BLK_MQ_RQ_QUEUE_BUSY;
 +              }
        }
  
        return BLK_MQ_RQ_QUEUE_OK;
diff --combined drivers/md/raid10.c
index f55c3f35b7463141086afb727785c775c5185d76,fca825718f29a2ba6d5d39789ba6593699b3aec9..188d8e9a6bdcc39e4da54095466f45683d6b2177
@@@ -914,7 -914,7 +914,7 @@@ static int raid10_congested(struct mdde
        struct r10conf *conf = mddev->private;
        int i, ret = 0;
  
-       if ((bits & (1 << BDI_async_congested)) &&
+       if ((bits & (1 << WB_async_congested)) &&
            conf->pending_count >= max_queued_requests)
                return 1;
  
@@@ -4156,7 -4156,6 +4156,7 @@@ static int raid10_start_reshape(struct 
  
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
  
diff --combined fs/ext4/extents.c
index d86d2622f82631f3fa46abe1047caf319b968402,e8b5866ffa07f838c82778757fee69fc81bc097f..aadb7282883493597f8dae099f20c3e86694bea8
@@@ -39,6 -39,7 +39,7 @@@
  #include <linux/slab.h>
  #include <asm/uaccess.h>
  #include <linux/fiemap.h>
+ #include <linux/backing-dev.h>
  #include "ext4_jbd2.h"
  #include "ext4_extents.h"
  #include "xattr.h"
@@@ -377,7 -378,7 +378,7 @@@ static int ext4_valid_extent(struct ino
        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
        ext4_lblk_t last = lblock + len - 1;
  
 -      if (lblock > last)
 +      if (len == 0 || lblock > last)
                return 0;
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
  }
@@@ -4456,8 -4457,6 +4457,8 @@@ int ext4_ext_map_blocks(handle_t *handl
                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
 +      if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
 +              ar.flags |= EXT4_MB_USE_RESERVED;
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
@@@ -4665,7 -4664,6 +4666,7 @@@ static int ext4_alloc_file_blocks(struc
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
 +      int depth = 0;
        struct ext4_map_blocks map;
        unsigned int credits;
        loff_t epos;
        if (len <= EXT_UNWRITTEN_MAX_LEN)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
  
 +      /* Wait all existing dio workers, newcomers will block on i_mutex */
 +      ext4_inode_block_unlocked_dio(inode);
 +      inode_dio_wait(inode);
 +
        /*
         * credits to insert 1 extent into extent tree
         */
        credits = ext4_chunk_trans_blocks(inode, len);
 +      /*
 +       * We can only call ext_depth() on extent based inodes
 +       */
 +      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              depth = ext_depth(inode);
 +      else
 +              depth = -1;
  
  retry:
        while (ret >= 0 && len) {
 +              /*
 +               * Recalculate credits when extent tree depth changes.
 +               */
 +              if (depth >= 0 && depth != ext_depth(inode)) {
 +                      credits = ext4_chunk_trans_blocks(inode, len);
 +                      depth = ext_depth(inode);
 +              }
 +
                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            credits);
                if (IS_ERR(handle)) {
                goto retry;
        }
  
 +      ext4_inode_resume_unlocked_dio(inode);
 +
        return ret > 0 ? ret2 : ret;
  }
  
@@@ -4936,14 -4913,12 +4937,14 @@@ long ext4_fallocate(struct file *file, 
         * bug we should fix....
         */
        if (ext4_encrypted_inode(inode) &&
 -          (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
 +          (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
 +                   FALLOC_FL_ZERO_RANGE)))
                return -EOPNOTSUPP;
  
        /* Return error if mode is not supported */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 -                   FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 +                   FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
 +                   FALLOC_FL_INSERT_RANGE))
                return -EOPNOTSUPP;
  
        if (mode & FALLOC_FL_PUNCH_HOLE)
        if (mode & FALLOC_FL_COLLAPSE_RANGE)
                return ext4_collapse_range(inode, offset, len);
  
 +      if (mode & FALLOC_FL_INSERT_RANGE)
 +              return ext4_insert_range(inode, offset, len);
 +
        if (mode & FALLOC_FL_ZERO_RANGE)
                return ext4_zero_range(file, offset, len, mode);
  
@@@ -5253,13 -5225,13 +5254,13 @@@ ext4_access_path(handle_t *handle, stru
  /*
   * ext4_ext_shift_path_extents:
   * Shift the extents of a path structure lying between path[depth].p_ext
 - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
 - * from starting block for each extent.
 + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 + * if it is right shift or left shift operation.
   */
  static int
  ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                            struct inode *inode, handle_t *handle,
 -                          ext4_lblk_t *start)
 +                          enum SHIFT_DIRECTION SHIFT)
  {
        int depth, err = 0;
        struct ext4_extent *ex_start, *ex_last;
                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
                                update = 1;
  
 -                      *start = le32_to_cpu(ex_last->ee_block) +
 -                              ext4_ext_get_actual_len(ex_last);
 -
                        while (ex_start <= ex_last) {
 -                              le32_add_cpu(&ex_start->ee_block, -shift);
 -                              /* Try to merge to the left. */
 -                              if ((ex_start >
 -                                   EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
 -                                  ext4_ext_try_to_merge_right(inode,
 -                                                      path, ex_start - 1))
 +                              if (SHIFT == SHIFT_LEFT) {
 +                                      le32_add_cpu(&ex_start->ee_block,
 +                                              -shift);
 +                                      /* Try to merge to the left. */
 +                                      if ((ex_start >
 +                                          EXT_FIRST_EXTENT(path[depth].p_hdr))
 +                                          &&
 +                                          ext4_ext_try_to_merge_right(inode,
 +                                          path, ex_start - 1))
 +                                              ex_last--;
 +                                      else
 +                                              ex_start++;
 +                              } else {
 +                                      le32_add_cpu(&ex_last->ee_block, shift);
 +                                      ext4_ext_try_to_merge_right(inode, path,
 +                                              ex_last);
                                        ex_last--;
 -                              else
 -                                      ex_start++;
 +                              }
                        }
                        err = ext4_ext_dirty(handle, inode, path + depth);
                        if (err)
                if (err)
                        goto out;
  
 -              le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
 +              if (SHIFT == SHIFT_LEFT)
 +                      le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
 +              else
 +                      le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;
  
  /*
   * ext4_ext_shift_extents:
 - * All the extents which lies in the range from start to the last allocated
 - * block for the file are shifted downwards by shift blocks.
 + * All the extents which lies in the range from @start to the last allocated
 + * block for the @inode are shifted either towards left or right (depending
 + * upon @SHIFT) by @shift blocks.
   * On success, 0 is returned, error otherwise.
   */
  static int
  ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 -                     ext4_lblk_t start, ext4_lblk_t shift)
 +                     ext4_lblk_t start, ext4_lblk_t shift,
 +                     enum SHIFT_DIRECTION SHIFT)
  {
        struct ext4_ext_path *path;
        int ret = 0, depth;
        struct ext4_extent *extent;
 -      ext4_lblk_t stop_block;
 -      ext4_lblk_t ex_start, ex_end;
 +      ext4_lblk_t stop, *iterator, ex_start, ex_end;
  
        /* Let path point to the last extent */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
        if (!extent)
                goto out;
  
 -      stop_block = le32_to_cpu(extent->ee_block) +
 +      stop = le32_to_cpu(extent->ee_block) +
                        ext4_ext_get_actual_len(extent);
  
 -      /* Nothing to shift, if hole is at the end of file */
 -      if (start >= stop_block)
 -              goto out;
 +       /*
 +       * In case of left shift, Don't start shifting extents until we make
 +       * sure the hole is big enough to accommodate the shift.
 +      */
 +      if (SHIFT == SHIFT_LEFT) {
 +              path = ext4_find_extent(inode, start - 1, &path, 0);
 +              if (IS_ERR(path))
 +                      return PTR_ERR(path);
 +              depth = path->p_depth;
 +              extent =  path[depth].p_ext;
 +              if (extent) {
 +                      ex_start = le32_to_cpu(extent->ee_block);
 +                      ex_end = le32_to_cpu(extent->ee_block) +
 +                              ext4_ext_get_actual_len(extent);
 +              } else {
 +                      ex_start = 0;
 +                      ex_end = 0;
 +              }
  
 -      /*
 -       * Don't start shifting extents until we make sure the hole is big
 -       * enough to accomodate the shift.
 -       */
 -      path = ext4_find_extent(inode, start - 1, &path, 0);
 -      if (IS_ERR(path))
 -              return PTR_ERR(path);
 -      depth = path->p_depth;
 -      extent =  path[depth].p_ext;
 -      if (extent) {
 -              ex_start = le32_to_cpu(extent->ee_block);
 -              ex_end = le32_to_cpu(extent->ee_block) +
 -                      ext4_ext_get_actual_len(extent);
 -      } else {
 -              ex_start = 0;
 -              ex_end = 0;
 +              if ((start == ex_start && shift > ex_start) ||
 +                  (shift > start - ex_end)) {
 +                      ext4_ext_drop_refs(path);
 +                      kfree(path);
 +                      return -EINVAL;
 +              }
        }
  
 -      if ((start == ex_start && shift > ex_start) ||
 -          (shift > start - ex_end))
 -              return -EINVAL;
 +      /*
 +       * In case of left shift, iterator points to start and it is increased
 +       * till we reach stop. In case of right shift, iterator points to stop
 +       * and it is decreased till we reach start.
 +       */
 +      if (SHIFT == SHIFT_LEFT)
 +              iterator = &start;
 +      else
 +              iterator = &stop;
  
        /* Its safe to start updating extents */
 -      while (start < stop_block) {
 -              path = ext4_find_extent(inode, start, &path, 0);
 +      while (start < stop) {
 +              path = ext4_find_extent(inode, *iterator, &path, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
 -                                       (unsigned long) start);
 +                                       (unsigned long) *iterator);
                        return -EIO;
                }
 -              if (start > le32_to_cpu(extent->ee_block)) {
 +              if (SHIFT == SHIFT_LEFT && *iterator >
 +                  le32_to_cpu(extent->ee_block)) {
                        /* Hole, move to the next extent */
                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                path[depth].p_ext++;
                        } else {
 -                              start = ext4_ext_next_allocated_block(path);
 +                              *iterator = ext4_ext_next_allocated_block(path);
                                continue;
                        }
                }
 +
 +              if (SHIFT == SHIFT_LEFT) {
 +                      extent = EXT_LAST_EXTENT(path[depth].p_hdr);
 +                      *iterator = le32_to_cpu(extent->ee_block) +
 +                                      ext4_ext_get_actual_len(extent);
 +              } else {
 +                      extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
 +                      *iterator =  le32_to_cpu(extent->ee_block) > 0 ?
 +                              le32_to_cpu(extent->ee_block) - 1 : 0;
 +                      /* Update path extent in case we need to stop */
 +                      while (le32_to_cpu(extent->ee_block) < start)
 +                              extent++;
 +                      path[depth].p_ext = extent;
 +              }
                ret = ext4_ext_shift_path_extents(path, shift, inode,
 -                              handle, &start);
 +                              handle, SHIFT);
                if (ret)
                        break;
        }
@@@ -5461,14 -5397,6 +5462,14 @@@ int ext4_collapse_range(struct inode *i
        loff_t new_size, ioffset;
        int ret;
  
 +      /*
 +       * We need to test this early because xfstests assumes that a
 +       * collapse range of (0, 1) will return EOPNOTSUPP if the file
 +       * system does not support collapse range.
 +       */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return -EOPNOTSUPP;
 +
        /* Collapse range works only on fs block size aligned offsets. */
        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
            len & (EXT4_CLUSTER_SIZE(sb) - 1))
        ext4_discard_preallocations(inode);
  
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
 -                                   punch_stop - punch_start);
 +                                   punch_stop - punch_start, SHIFT_LEFT);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
@@@ -5575,174 -5503,6 +5576,174 @@@ out_mutex
        return ret;
  }
  
 +/*
 + * ext4_insert_range:
 + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
 + * The data blocks starting from @offset to the EOF are shifted by @len
 + * towards right to create a hole in the @inode. Inode size is increased
 + * by len bytes.
 + * Returns 0 on success, error otherwise.
 + */
 +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 +{
 +      struct super_block *sb = inode->i_sb;
 +      handle_t *handle;
 +      struct ext4_ext_path *path;
 +      struct ext4_extent *extent;
 +      ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
 +      unsigned int credits, ee_len;
 +      int ret = 0, depth, split_flag = 0;
 +      loff_t ioffset;
 +
 +      /*
 +       * We need to test this early because xfstests assumes that an
 +       * insert range of (0, 1) will return EOPNOTSUPP if the file
 +       * system does not support insert range.
 +       */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return -EOPNOTSUPP;
 +
 +      /* Insert range works only on fs block size aligned offsets. */
 +      if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
 +                      len & (EXT4_CLUSTER_SIZE(sb) - 1))
 +              return -EINVAL;
 +
 +      if (!S_ISREG(inode->i_mode))
 +              return -EOPNOTSUPP;
 +
 +      trace_ext4_insert_range(inode, offset, len);
 +
 +      offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
 +      len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
 +
 +      /* Call ext4_force_commit to flush all data in case of data=journal */
 +      if (ext4_should_journal_data(inode)) {
 +              ret = ext4_force_commit(inode->i_sb);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      /*
 +       * Need to round down to align start offset to page size boundary
 +       * for page size > block size.
 +       */
 +      ioffset = round_down(offset, PAGE_SIZE);
 +
 +      /* Write out all dirty pages */
 +      ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
 +                      LLONG_MAX);
 +      if (ret)
 +              return ret;
 +
 +      /* Take mutex lock */
 +      mutex_lock(&inode->i_mutex);
 +
 +      /* Currently just for extent based files */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 +              ret = -EOPNOTSUPP;
 +              goto out_mutex;
 +      }
 +
 +      /* Check for wrap through zero */
 +      if (inode->i_size + len > inode->i_sb->s_maxbytes) {
 +              ret = -EFBIG;
 +              goto out_mutex;
 +      }
 +
 +      /* Offset should be less than i_size */
 +      if (offset >= i_size_read(inode)) {
 +              ret = -EINVAL;
 +              goto out_mutex;
 +      }
 +
 +      truncate_pagecache(inode, ioffset);
 +
 +      /* Wait for existing dio to complete */
 +      ext4_inode_block_unlocked_dio(inode);
 +      inode_dio_wait(inode);
 +
 +      credits = ext4_writepage_trans_blocks(inode);
 +      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 +      if (IS_ERR(handle)) {
 +              ret = PTR_ERR(handle);
 +              goto out_dio;
 +      }
 +
 +      /* Expand the file to avoid data loss if there is an error while shifting */
 +      inode->i_size += len;
 +      EXT4_I(inode)->i_disksize += len;
 +      inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 +      ret = ext4_mark_inode_dirty(handle, inode);
 +      if (ret)
 +              goto out_stop;
 +
 +      down_write(&EXT4_I(inode)->i_data_sem);
 +      ext4_discard_preallocations(inode);
 +
 +      path = ext4_find_extent(inode, offset_lblk, NULL, 0);
 +      if (IS_ERR(path)) {
 +              up_write(&EXT4_I(inode)->i_data_sem);
 +              goto out_stop;
 +      }
 +
 +      depth = ext_depth(inode);
 +      extent = path[depth].p_ext;
 +      if (extent) {
 +              ee_start_lblk = le32_to_cpu(extent->ee_block);
 +              ee_len = ext4_ext_get_actual_len(extent);
 +
 +              /*
 +               * If offset_lblk is not the starting block of the extent,
 +               * split the extent at @offset_lblk
 +               */
 +              if ((offset_lblk > ee_start_lblk) &&
 +                              (offset_lblk < (ee_start_lblk + ee_len))) {
 +                      if (ext4_ext_is_unwritten(extent))
 +                              split_flag = EXT4_EXT_MARK_UNWRIT1 |
 +                                      EXT4_EXT_MARK_UNWRIT2;
 +                      ret = ext4_split_extent_at(handle, inode, &path,
 +                                      offset_lblk, split_flag,
 +                                      EXT4_EX_NOCACHE |
 +                                      EXT4_GET_BLOCKS_PRE_IO |
 +                                      EXT4_GET_BLOCKS_METADATA_NOFAIL);
 +              }
 +
 +              ext4_ext_drop_refs(path);
 +              kfree(path);
 +              if (ret < 0) {
 +                      up_write(&EXT4_I(inode)->i_data_sem);
 +                      goto out_stop;
 +              }
 +      }
 +
 +      ret = ext4_es_remove_extent(inode, offset_lblk,
 +                      EXT_MAX_BLOCKS - offset_lblk);
 +      if (ret) {
 +              up_write(&EXT4_I(inode)->i_data_sem);
 +              goto out_stop;
 +      }
 +
 +      /*
 +       * If offset_lblk lies in a hole at the start of the file, use
 +       * ee_start_lblk to shift the extents
 +       */
 +      ret = ext4_ext_shift_extents(inode, handle,
 +              ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
 +              len_lblk, SHIFT_RIGHT);
 +
 +      up_write(&EXT4_I(inode)->i_data_sem);
 +      if (IS_SYNC(inode))
 +              ext4_handle_sync(handle);
 +
 +out_stop:
 +      ext4_journal_stop(handle);
 +out_dio:
 +      ext4_inode_resume_unlocked_dio(inode);
 +out_mutex:
 +      mutex_unlock(&inode->i_mutex);
 +      return ret;
 +}
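
For context, ext4_insert_range() above backs the new FALLOC_FL_INSERT_RANGE mode of
fallocate(2). A minimal userspace sketch of how it would be exercised follows; the
helper name is illustrative, and the only hard requirement assumed is that offset and
len be multiples of the filesystem block size, as checked by the kernel side above.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

/* Sketch: open a hole in an extent-based file by shifting everything
 * from 'offset' onwards to the right by 'len'. */
int insert_hole(const char *path, off_t offset, off_t len)
{
	int fd = open(path, O_RDWR);
	int ret = 0;

	if (fd < 0)
		return -1;
	/* offset and len must both be multiples of the fs block size */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len) < 0) {
		perror("fallocate");
		ret = -1;
	}
	close(fd);
	return ret;
}
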
 +
  /**
   * ext4_swap_extents - Swap extents between two inodes
   *
@@@ -5775,7 -5535,7 +5776,7 @@@ ext4_swap_extents(handle_t *handle, str
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
        BUG_ON(!mutex_is_locked(&inode1->i_mutex));
 -      BUG_ON(!mutex_is_locked(&inode1->i_mutex));
 +      BUG_ON(!mutex_is_locked(&inode2->i_mutex));
  
        *erp = ext4_es_remove_extent(inode1, lblk1, count);
        if (unlikely(*erp))
diff --combined fs/ext4/mballoc.c
index 1c535fa67640da69def57f0e88f5c8d5e233c0bc,440987c8ba9ef2dc27eee2092e244d6f4c32bb7b..f6aedf88da437ee324c314bb1020baa51db0423c
@@@ -26,6 -26,7 +26,7 @@@
  #include <linux/log2.h>
  #include <linux/module.h>
  #include <linux/slab.h>
+ #include <linux/backing-dev.h>
  #include <trace/events/ext4.h>
  
  #ifdef CONFIG_EXT4_DEBUG
@@@ -882,8 -883,10 +883,8 @@@ static int ext4_mb_init_cache(struct pa
  
        /* wait for I/O completion */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 -              if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
 +              if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
                        err = -EIO;
 -                      goto out;
 -              }
        }
  
        first_block = page->index * blocks_per_page;
                        /* skip initialized uptodate buddy */
                        continue;
  
 +              if (!buffer_verified(bh[group - first_group]))
 +                      /* Skip faulty bitmaps */
 +                      continue;
 +              err = 0;
 +
                /*
                 * data carry information regarding this
                 * particular group in the format specified
@@@ -2011,12 -2009,7 +2012,12 @@@ void ext4_mb_scan_aligned(struct ext4_a
        }
  }
  
 -/* This is now called BEFORE we load the buddy bitmap. */
 +/*
 + * This is now called BEFORE we load the buddy bitmap.
 + * Returns 1 if the group is suitable for the allocation and 0 if it is
 + * not.  In addition, it can return a negative error code when something
 + * goes wrong.
 + */
  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                                ext4_group_t group, int cr)
  {
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                int ret = ext4_mb_init_group(ac->ac_sb, group);
                if (ret)
 -                      return 0;
 +                      return ret;
        }
  
        fragments = grp->bb_fragments;
@@@ -2086,7 -2079,7 +2087,7 @@@ ext4_mb_regular_allocator(struct ext4_a
  {
        ext4_group_t ngroups, group, i;
        int cr;
 -      int err = 0;
 +      int err = 0, first_err = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
@@@ -2153,7 -2146,6 +2154,7 @@@ repeat
                group = ac->ac_g_ex.fe_group;
  
                for (i = 0; i < ngroups; group++, i++) {
 +                      int ret = 0;
                        cond_resched();
                        /*
                         * Artificially restricted ngroups for non-extent
                                group = 0;
  
                        /* This now checks without needing the buddy page */
 -                      if (!ext4_mb_good_group(ac, group, cr))
 +                      ret = ext4_mb_good_group(ac, group, cr);
 +                      if (ret <= 0) {
 +                              if (!first_err)
 +                                      first_err = ret;
                                continue;
 +                      }
  
                        err = ext4_mb_load_buddy(sb, group, &e4b);
                        if (err)
                         * We need to check again after locking the
                         * block group
                         */
 -                      if (!ext4_mb_good_group(ac, group, cr)) {
 +                      ret = ext4_mb_good_group(ac, group, cr);
 +                      if (ret <= 0) {
                                ext4_unlock_group(sb, group);
                                ext4_mb_unload_buddy(&e4b);
 +                              if (!first_err)
 +                                      first_err = ret;
                                continue;
                        }
  
                }
        }
  out:
 +      if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 +              err = first_err;
        return err;
  }
  
@@@ -2275,9 -2258,12 +2276,9 @@@ static int ext4_mb_seq_groups_show(stru
  
        group--;
        if (group == 0)
 -              seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
 -                              "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
 -                                "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
 -                         "group", "free", "frags", "first",
 -                         "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
 -                         "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
 +              seq_puts(seq, "#group: free  frags first ["
 +                            " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
 +                            " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
  
        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
                sizeof(struct ext4_group_info);
diff --combined fs/ext4/super.c
index 90ec13fe8ac73e5d81cda8ce4d267afbf2ff571a,56b8bb75c3fc3b8809bf310dde5df49139caeb35..a7b4b6e1026920823a149b2f124371cba092e387
@@@ -24,6 -24,7 +24,7 @@@
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/parser.h>
  #include <linux/buffer_head.h>
  #include <linux/exportfs.h>
@@@ -294,8 -295,6 +295,8 @@@ static void __save_error_info(struct su
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
  
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 +      if (bdev_read_only(sb->s_bdev))
 +              return;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        es->s_last_error_time = cpu_to_le32(get_seconds());
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
@@@ -591,17 -590,14 +592,17 @@@ void __ext4_msg(struct super_block *sb
        va_end(args);
  }
  
 +#define ext4_warning_ratelimit(sb)                                    \
 +              ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
 +                           "EXT4-fs warning")
 +
  void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
  {
        struct va_format vaf;
        va_list args;
  
 -      if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 -                        "EXT4-fs warning"))
 +      if (!ext4_warning_ratelimit(sb))
                return;
  
        va_start(args, fmt);
        va_end(args);
  }
  
 +void __ext4_warning_inode(const struct inode *inode, const char *function,
 +                        unsigned int line, const char *fmt, ...)
 +{
 +      struct va_format vaf;
 +      va_list args;
 +
 +      if (!ext4_warning_ratelimit(inode->i_sb))
 +              return;
 +
 +      va_start(args, fmt);
 +      vaf.fmt = fmt;
 +      vaf.va = &args;
 +      printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 +             "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 +             function, line, inode->i_ino, current->comm, &vaf);
 +      va_end(args);
 +}
 +
  void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
@@@ -828,7 -806,6 +829,7 @@@ static void ext4_put_super(struct super
                dump_orphan_list(sb, sbi);
        J_ASSERT(list_empty(&sbi->s_orphan));
  
 +      sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
                /*
@@@ -901,8 -878,9 +902,8 @@@ static struct inode *ext4_alloc_inode(s
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
 +      ei->i_crypt_info = NULL;
  #endif
 -
        return &ei->vfs_inode;
  }
  
@@@ -979,10 -957,6 +980,10 @@@ void ext4_clear_inode(struct inode *ino
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
 +#ifdef CONFIG_EXT4_FS_ENCRYPTION
 +      if (EXT4_I(inode)->i_crypt_info)
 +              ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
 +#endif
  }
  
  static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@@ -3474,6 -3448,11 +3475,6 @@@ static int ext4_fill_super(struct super
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 -#ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      /* Modes of operations for file and directory encryption. */
 -      sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
 -      sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
 -#endif
  
        /* Cleanup superblock name */
        for (cp = sb->s_id; (cp = strchr(cp, '/'));)
@@@ -4087,15 -4066,7 +4088,15 @@@ no_journal
                }
        }
  
 -      if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
 +      if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
 +           EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
 +          (blocksize != PAGE_CACHE_SIZE)) {
 +              ext4_msg(sb, KERN_ERR,
 +                       "Unsupported blocksize for fs encryption");
 +              goto failed_mount_wq;
 +      }
 +
 +      if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
            !(sb->s_flags & MS_RDONLY) &&
            !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
@@@ -4971,9 -4942,6 +4972,9 @@@ static int ext4_remount(struct super_bl
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }
  
 +      if (*flags & MS_LAZYTIME)
 +              sb->s_flags |= MS_LAZYTIME;
 +
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
                if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
                        err = -EROFS;
@@@ -5441,7 -5409,6 +5442,7 @@@ static ssize_t ext4_quota_write(struct 
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err, offset = off & (sb->s_blocksize - 1);
 +      int retries = 0;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
  
                return -EIO;
        }
  
 -      bh = ext4_bread(handle, inode, blk, 1);
 +      do {
 +              bh = ext4_bread(handle, inode, blk,
 +                              EXT4_GET_BLOCKS_CREATE |
 +                              EXT4_GET_BLOCKS_METADATA_NOFAIL);
 +      } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
 +               ext4_should_retry_alloc(inode->i_sb, &retries));
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
@@@ -5684,7 -5646,6 +5685,7 @@@ out7
  
  static void __exit ext4_exit_fs(void)
  {
 +      ext4_exit_crypto();
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
diff --combined fs/f2fs/node.c
index d9c52424bac21555f5a84a2f8ef9f7f2dc7751c5,d211602e0f86f7396553cf422439a94e118a22cc..7dd63b794bfb5a04ae0d203c9ed2c8e739f0eb08
@@@ -53,7 -53,7 +53,7 @@@ bool available_free_memory(struct f2fs_
                                                        PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
        } else if (type == DIRTY_DENTS) {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
                mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@@ -70,7 -70,7 +70,7 @@@
                                sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
        } else {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
        }
        return res;
@@@ -195,35 -195,32 +195,35 @@@ static unsigned int __gang_lookup_nat_s
                                                        start, nr);
  }
  
 -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
  {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
 -      bool is_cp = true;
 +      bool need = false;
  
        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
 -      if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 -              is_cp = false;
 +      if (e) {
 +              if (!get_nat_flag(e, IS_CHECKPOINTED) &&
 +                              !get_nat_flag(e, HAS_FSYNCED_INODE))
 +                      need = true;
 +      }
        up_read(&nm_i->nat_tree_lock);
 -      return is_cp;
 +      return need;
  }
  
 -bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
 +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
  {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
 -      bool fsynced = false;
 +      bool is_cp = true;
  
        down_read(&nm_i->nat_tree_lock);
 -      e = __lookup_nat_cache(nm_i, ino);
 -      if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
 -              fsynced = true;
 +      e = __lookup_nat_cache(nm_i, nid);
 +      if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 +              is_cp = false;
        up_read(&nm_i->nat_tree_lock);
 -      return fsynced;
 +      return is_cp;
  }
  
  bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
@@@ -315,8 -312,7 +315,8 @@@ static void set_node_addr(struct f2fs_s
        __set_nat_cache_dirty(nm_i, e);
  
        /* update fsync_mark if its inode nat entry is still alive */
 -      e = __lookup_nat_cache(nm_i, ni->ino);
 +      if (ni->nid != ni->ino)
 +              e = __lookup_nat_cache(nm_i, ni->ino);
        if (e) {
                if (fsync_done && ni->nid == ni->ino)
                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@@ -999,11 -995,8 +999,11 @@@ static int read_node_page(struct page *
        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
        struct node_info ni;
        struct f2fs_io_info fio = {
 +              .sbi = sbi,
                .type = NODE,
                .rw = rw,
 +              .page = page,
 +              .encrypted_page = NULL,
        };
  
        get_node_info(sbi, page->index, &ni);
                return LOCKED_PAGE;
  
        fio.blk_addr = ni.blk_addr;
 -      return f2fs_submit_page_bio(sbi, page, &fio);
 +      return f2fs_submit_page_bio(&fio);
  }
  
  /*
@@@ -1211,9 -1204,13 +1211,9 @@@ continue_unlock
                        /* called by fsync() */
                        if (ino && IS_DNODE(page)) {
                                set_fsync_mark(page, 1);
 -                              if (IS_INODE(page)) {
 -                                      if (!is_checkpointed_node(sbi, ino) &&
 -                                              !has_fsynced_inode(sbi, ino))
 -                                              set_dentry_mark(page, 1);
 -                                      else
 -                                              set_dentry_mark(page, 0);
 -                              }
 +                              if (IS_INODE(page))
 +                                      set_dentry_mark(page,
 +                                              need_dentry_mark(sbi, ino));
                                nwritten++;
                        } else {
                                set_fsync_mark(page, 0);
@@@ -1296,11 -1293,8 +1296,11 @@@ static int f2fs_write_node_page(struct 
        nid_t nid;
        struct node_info ni;
        struct f2fs_io_info fio = {
 +              .sbi = sbi,
                .type = NODE,
                .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
 +              .page = page,
 +              .encrypted_page = NULL,
        };
  
        trace_f2fs_writepage(page, NODE);
  
        set_page_writeback(page);
        fio.blk_addr = ni.blk_addr;
 -      write_node_page(sbi, page, nid, &fio);
 +      write_node_page(nid, &fio);
        set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
        dec_page_count(sbi, F2FS_DIRTY_NODES);
        up_read(&sbi->node_write);
diff --combined fs/f2fs/segment.h
index 8496357781188188126c1de28afc55f347d10198,aba72f7a8ac4b45e8b05111cd15f42bb1b26e0cd..79e7b879a75321047bf00fd22d6b55d37a270af7
@@@ -9,6 -9,7 +9,7 @@@
   * published by the Free Software Foundation.
   */
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  
  /* constant macro */
  #define NULL_SEGNO                    ((unsigned int)(~0))
@@@ -163,7 -164,6 +164,7 @@@ struct seg_entry 
         */
        unsigned short ckpt_valid_blocks;
        unsigned char *ckpt_valid_map;
 +      unsigned char *discard_map;
        unsigned char type;             /* segment type like CURSEG_XXX_TYPE */
        unsigned long long mtime;       /* modification time of the segment */
  };
@@@ -714,7 -714,7 +715,7 @@@ static inline unsigned int max_hw_block
   */
  static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
  {
-       if (sbi->sb->s_bdi->dirty_exceeded)
+       if (sbi->sb->s_bdi->wb.dirty_exceeded)
                return 0;
  
        if (type == DATA)
diff --combined fs/inode.c
index e8d62688ed9181e511e2a0e8c6a5f36840cdbe94,efc9edacfb9b4a5da04231e7692b16d59b44736e..069721f0cc0e0b733bb659fb0d7836cd71499690
@@@ -152,7 -152,6 +152,7 @@@ int inode_init_always(struct super_bloc
        inode->i_pipe = NULL;
        inode->i_bdev = NULL;
        inode->i_cdev = NULL;
 +      inode->i_link = NULL;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;
  
@@@ -224,6 -223,7 +224,7 @@@ EXPORT_SYMBOL(free_inode_nonrcu)
  void __destroy_inode(struct inode *inode)
  {
        BUG_ON(inode_has_buffers(inode));
+       inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode->i_flctx);
@@@ -1585,47 -1585,36 +1586,47 @@@ static int update_time(struct inode *in
   *    This function automatically handles read only file systems and media,
   *    as well as the "noatime" flag and inode specific "noatime" markers.
   */
 -void touch_atime(const struct path *path)
 +bool atime_needs_update(const struct path *path, struct inode *inode)
  {
        struct vfsmount *mnt = path->mnt;
 -      struct inode *inode = d_inode(path->dentry);
        struct timespec now;
  
        if (inode->i_flags & S_NOATIME)
 -              return;
 +              return false;
        if (IS_NOATIME(inode))
 -              return;
 +              return false;
        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
 -              return;
 +              return false;
  
        if (mnt->mnt_flags & MNT_NOATIME)
 -              return;
 +              return false;
        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
 -              return;
 +              return false;
  
        now = current_fs_time(inode->i_sb);
  
        if (!relatime_need_update(mnt, inode, now))
 -              return;
 +              return false;
  
        if (timespec_equal(&inode->i_atime, &now))
 +              return false;
 +
 +      return true;
 +}
 +
 +void touch_atime(const struct path *path)
 +{
 +      struct vfsmount *mnt = path->mnt;
 +      struct inode *inode = d_inode(path->dentry);
 +      struct timespec now;
 +
 +      if (!atime_needs_update(path, inode))
                return;
  
        if (!sb_start_write_trylock(inode->i_sb))
                return;
  
 -      if (__mnt_want_write(mnt))
 +      if (__mnt_want_write(mnt) != 0)
                goto skip_update;
        /*
         * File systems can error out when updating inodes if they need to
         * We may also fail on filesystems that have the ability to make parts
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
 +      now = current_fs_time(inode->i_sb);
        update_time(inode, &now, S_ATIME);
        __mnt_drop_write(mnt);
  skip_update:
diff --combined fs/nfs/write.c
index dfc19f1575a19d00bee1b0aeef6575e4416a9ef9,94c7ce01dfb1b27403ca706ab73fe48445163570..e6c262555e08a62aff65ef3baa04e9666e9f18c2
@@@ -853,7 -853,8 +853,8 @@@ static voi
  nfs_clear_page_commit(struct page *page)
  {
        dec_zone_page_state(page, NR_UNSTABLE_NFS);
-       dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+       dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+                   WB_RECLAIMABLE);
  }
  
  /* Called holding inode (/cinfo) lock */
@@@ -1845,15 -1846,12 +1846,15 @@@ int nfs_wb_all(struct inode *inode
        trace_nfs_writeback_inode_enter(inode);
  
        ret = filemap_write_and_wait(inode->i_mapping);
 -      if (!ret) {
 -              ret = nfs_commit_inode(inode, FLUSH_SYNC);
 -              if (!ret)
 -                      pnfs_sync_inode(inode, true);
 -      }
 +      if (ret)
 +              goto out;
 +      ret = nfs_commit_inode(inode, FLUSH_SYNC);
 +      if (ret < 0)
 +              goto out;
 +      pnfs_sync_inode(inode, true);
 +      ret = 0;
  
 +out:
        trace_nfs_writeback_inode_exit(inode, ret);
        return ret;
  }
diff --combined fs/ocfs2/file.c
index fbfadb289e628ce32decb024bab92792e4ebc995,8f1feca89fb08fda21d050e3fcc49e1bb4c88398..719f7f4c7a37bd8cfb292fed77756baed9bacb0b
@@@ -37,6 -37,7 +37,7 @@@
  #include <linux/falloc.h>
  #include <linux/quotaops.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  
  #include <cluster/masklog.h>
  
@@@ -2250,7 -2251,7 +2251,7 @@@ out
  static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                    struct iov_iter *from)
  {
 -      int direct_io, appending, rw_level, have_alloc_sem  = 0;
 +      int direct_io, appending, rw_level;
        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
        ssize_t ret;
  
        mutex_lock(&inode->i_mutex);
  
 -      ocfs2_iocb_clear_sem_locked(iocb);
 -
  relock:
 -      /* to match setattr's i_mutex -> rw_lock ordering */
 -      if (direct_io) {
 -              have_alloc_sem = 1;
 -              /* communicate with ocfs2_dio_end_io */
 -              ocfs2_iocb_set_sem_locked(iocb);
 -      }
 -
        /*
         * Concurrent O_DIRECT writes are allowed with
         * mount_option "coherency=buffered".
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
 -              goto out_sems;
 +              goto out_mutex;
        }
  
        /*
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
  
 -              have_alloc_sem = 0;
                rw_level = -1;
  
                direct_io = 0;
@@@ -2406,6 -2417,7 +2407,6 @@@ no_sync
         */
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
 -              have_alloc_sem = 0;
                unaligned_dio = 0;
        }
  
@@@ -2418,7 -2430,10 +2419,7 @@@ out
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
 -out_sems:
 -      if (have_alloc_sem)
 -              ocfs2_iocb_clear_sem_locked(iocb);
 -
 +out_mutex:
        mutex_unlock(&inode->i_mutex);
  
        if (written)
@@@ -2459,7 -2474,7 +2460,7 @@@ bail
  static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
                                   struct iov_iter *to)
  {
 -      int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
 +      int ret = 0, rw_level = -1, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = file_inode(filp);
  
                goto bail;
        }
  
 -      ocfs2_iocb_clear_sem_locked(iocb);
 -
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
         */
        if (iocb->ki_flags & IOCB_DIRECT) {
 -              have_alloc_sem = 1;
 -              ocfs2_iocb_set_sem_locked(iocb);
 -
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
                        mlog_errno(ret);
        /* see ocfs2_file_write_iter */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
 -              have_alloc_sem = 0;
        }
  
  bail:
 -      if (have_alloc_sem)
 -              ocfs2_iocb_clear_sem_locked(iocb);
 -
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
diff --combined fs/xfs/xfs_file.c
index 3b7591224f4a6698d32371a927e70cb2a391f4a9,4e00b38efbe0925f834999401e286682c630dd72..7c62fca53e2fc36b5c61f37a829e6532b7e210d2
@@@ -41,6 -41,7 +41,7 @@@
  #include <linux/dcache.h>
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
+ #include <linux/backing-dev.h>
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
@@@ -124,7 -125,7 +125,7 @@@ xfs_iozero
                status = 0;
        } while (count);
  
 -      return (-status);
 +      return status;
  }
  
  int
diff --combined include/linux/backing-dev.h
index d87d8eced06407c59c6d231f9e707bdcc398ce52,a13181a42b9aee87a652d1a94a7e322dc6aaaf6e..0e6d4828a77a358edd3c77ef7d14eecc6f6001b3
  #ifndef _LINUX_BACKING_DEV_H
  #define _LINUX_BACKING_DEV_H
  
- #include <linux/percpu_counter.h>
- #include <linux/log2.h>
- #include <linux/flex_proportions.h>
  #include <linux/kernel.h>
  #include <linux/fs.h>
  #include <linux/sched.h>
- #include <linux/timer.h>
+ #include <linux/blkdev.h>
  #include <linux/writeback.h>
- #include <linux/atomic.h>
- #include <linux/sysctl.h>
- #include <linux/workqueue.h>
- struct page;
- struct device;
- struct dentry;
- /*
-  * Bits in backing_dev_info.state
-  */
- enum bdi_state {
-       BDI_async_congested,    /* The async (write) queue is getting full */
-       BDI_sync_congested,     /* The sync queue is getting full */
-       BDI_registered,         /* bdi_register() was done */
-       BDI_writeback_running,  /* Writeback is in progress */
- };
- typedef int (congested_fn)(void *, int);
- enum bdi_stat_item {
-       BDI_RECLAIMABLE,
-       BDI_WRITEBACK,
-       BDI_DIRTIED,
-       BDI_WRITTEN,
-       NR_BDI_STAT_ITEMS
- };
- #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
- struct bdi_writeback {
-       struct backing_dev_info *bdi;   /* our parent bdi */
-       unsigned long last_old_flush;   /* last old data flush */
-       struct delayed_work dwork;      /* work item used for writeback */
-       struct list_head b_dirty;       /* dirty inodes */
-       struct list_head b_io;          /* parked for writeback */
-       struct list_head b_more_io;     /* parked for more writeback */
-       struct list_head b_dirty_time;  /* time stamps are dirty */
-       spinlock_t list_lock;           /* protects the b_* lists */
- };
- struct backing_dev_info {
-       struct list_head bdi_list;
-       unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
-       unsigned long state;    /* Always use atomic bitops on this */
-       unsigned int capabilities; /* Device capabilities */
-       congested_fn *congested_fn; /* Function pointer if device is md/dm */
-       void *congested_data;   /* Pointer to aux data for congested func */
-       char *name;
-       struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-       unsigned long bw_time_stamp;    /* last time write bw is updated */
-       unsigned long dirtied_stamp;
-       unsigned long written_stamp;    /* pages written at bw_time_stamp */
-       unsigned long write_bandwidth;  /* the estimated write bandwidth */
-       unsigned long avg_write_bandwidth; /* further smoothed write bw */
-       /*
-        * The base dirty throttle rate, re-calculated on every 200ms.
-        * All the bdi tasks' dirty rate will be curbed under it.
-        * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-        * in small steps and is much more smooth/stable than the latter.
-        */
-       unsigned long dirty_ratelimit;
-       unsigned long balanced_dirty_ratelimit;
-       struct fprop_local_percpu completions;
-       int dirty_exceeded;
-       unsigned int min_ratio;
-       unsigned int max_ratio, max_prop_frac;
-       struct bdi_writeback wb;  /* default writeback info for this bdi */
-       spinlock_t wb_lock;       /* protects work_list & wb.dwork scheduling */
-       struct list_head work_list;
-       struct device *dev;
-       struct timer_list laptop_mode_wb_timer;
- #ifdef CONFIG_DEBUG_FS
-       struct dentry *debug_dir;
-       struct dentry *debug_stats;
- #endif
- };
- struct backing_dev_info *inode_to_bdi(struct inode *inode);
+ #include <linux/blk-cgroup.h>
+ #include <linux/backing-dev-defs.h>
  
  int __must_check bdi_init(struct backing_dev_info *bdi);
  void bdi_destroy(struct backing_dev_info *bdi);
@@@ -116,98 -23,101 +23,100 @@@ __printf(3, 4
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 -void bdi_unregister(struct backing_dev_info *bdi);
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
- void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                       enum wb_reason reason);
- void bdi_start_background_writeback(struct backing_dev_info *bdi);
- void bdi_writeback_workfn(struct work_struct *work);
- int bdi_has_dirty_io(struct backing_dev_info *bdi);
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+                       bool range_cyclic, enum wb_reason reason);
+ void wb_start_background_writeback(struct bdi_writeback *wb);
+ void wb_workfn(struct work_struct *work);
+ void wb_wakeup_delayed(struct bdi_writeback *wb);
  
  extern spinlock_t bdi_lock;
  extern struct list_head bdi_list;
  
  extern struct workqueue_struct *bdi_wq;
  
- static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+ static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
  {
-       return !list_empty(&wb->b_dirty) ||
-              !list_empty(&wb->b_io) ||
-              !list_empty(&wb->b_more_io);
+       return test_bit(WB_has_dirty_io, &wb->state);
+ }
+ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+ {
+       /*
+        * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+        * any dirty wbs.  See wb_update_write_bandwidth().
+        */
+       return atomic_long_read(&bdi->tot_write_bandwidth);
  }
  
- static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item, s64 amount)
+ static inline void __add_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item, s64 amount)
  {
-       __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+       __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
  }
  
- static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void __inc_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
  {
-       __add_bdi_stat(bdi, item, 1);
+       __add_wb_stat(wb, item, 1);
  }
  
- static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        unsigned long flags;
  
        local_irq_save(flags);
-       __inc_bdi_stat(bdi, item);
+       __inc_wb_stat(wb, item);
        local_irq_restore(flags);
  }
  
- static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void __dec_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
  {
-       __add_bdi_stat(bdi, item, -1);
+       __add_wb_stat(wb, item, -1);
  }
  
- static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        unsigned long flags;
  
        local_irq_save(flags);
-       __dec_bdi_stat(bdi, item);
+       __dec_wb_stat(wb, item);
        local_irq_restore(flags);
  }
  
- static inline s64 bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
-       return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_read_positive(&wb->stat[item]);
  }
  
- static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+                               enum wb_stat_item item)
  {
-       return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_sum_positive(&wb->stat[item]);
  }
  
- static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        s64 sum;
        unsigned long flags;
  
        local_irq_save(flags);
-       sum = __bdi_stat_sum(bdi, item);
+       sum = __wb_stat_sum(wb, item);
        local_irq_restore(flags);
  
        return sum;
  }
  
- extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+ extern void wb_writeout_inc(struct bdi_writeback *wb);
  
  /*
   * maximal error of a stat counter.
   */
- static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+ static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
  {
  #ifdef CONFIG_SMP
-       return nr_cpu_ids * BDI_STAT_BATCH;
+       return nr_cpu_ids * WB_STAT_BATCH;
  #else
        return 1;
  #endif
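
The wb stat helpers above mirror the old per-bdi ones: wb_stat() is a cheap approximate
read, wb_stat_sum() an exact but expensive sum, and wb_stat_error() bounds the
approximation error. A sketch of the kind of check these enable (illustrative only,
not copied from the tree; the function name and threshold are assumptions):

/* Sketch: fall back to the exact, expensive sum only when the cheap
 * approximate counter is within its error margin of the threshold. */
static unsigned long wb_reclaimable_pages(struct bdi_writeback *wb,
					  unsigned long thresh)
{
	unsigned long nr = wb_stat(wb, WB_RECLAIMABLE);

	if (nr < thresh + wb_stat_error(wb))
		nr = wb_stat_sum(wb, WB_RECLAIMABLE);
	return nr;
}
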
@@@ -231,50 -141,57 +140,57 @@@ int bdi_set_max_ratio(struct backing_de
   * BDI_CAP_NO_WRITEBACK:   Don't write pages back
   * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
   * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+  *
+  * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
   */
  #define BDI_CAP_NO_ACCT_DIRTY 0x00000001
  #define BDI_CAP_NO_WRITEBACK  0x00000002
  #define BDI_CAP_NO_ACCT_WB    0x00000004
  #define BDI_CAP_STABLE_WRITES 0x00000008
  #define BDI_CAP_STRICTLIMIT   0x00000010
+ #define BDI_CAP_CGROUP_WRITEBACK 0x00000020
  
  #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
        (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
  
  extern struct backing_dev_info noop_backing_dev_info;
  
- int writeback_in_progress(struct backing_dev_info *bdi);
- static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+ /**
+  * writeback_in_progress - determine whether there is writeback in progress
+  * @wb: bdi_writeback of interest
+  *
+  * Determine whether there is writeback waiting to be handled against a
+  * bdi_writeback.
+  */
+ static inline bool writeback_in_progress(struct bdi_writeback *wb)
  {
-       if (bdi->congested_fn)
-               return bdi->congested_fn(bdi->congested_data, bdi_bits);
-       return (bdi->state & bdi_bits);
+       return test_bit(WB_writeback_running, &wb->state);
  }
  
- static inline int bdi_read_congested(struct backing_dev_info *bdi)
+ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
  {
-       return bdi_congested(bdi, 1 << BDI_sync_congested);
- }
+       struct super_block *sb;
  
- static inline int bdi_write_congested(struct backing_dev_info *bdi)
- {
-       return bdi_congested(bdi, 1 << BDI_async_congested);
+       if (!inode)
+               return &noop_backing_dev_info;
+       sb = inode->i_sb;
+ #ifdef CONFIG_BLOCK
+       if (sb_is_blkdev_sb(sb))
+               return blk_get_backing_dev_info(I_BDEV(inode));
+ #endif
+       return sb->s_bdi;
  }
  
- static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
  {
-       return bdi_congested(bdi, (1 << BDI_sync_congested) |
-                                 (1 << BDI_async_congested));
- }
+       struct backing_dev_info *bdi = wb->bdi;
  
- enum {
-       BLK_RW_ASYNC    = 0,
-       BLK_RW_SYNC     = 1,
- };
+       if (bdi->congested_fn)
+               return bdi->congested_fn(bdi->congested_data, cong_bits);
+       return wb->congested->state & cong_bits;
+ }
  
- void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
- void set_bdi_congested(struct backing_dev_info *bdi, int sync);
  long congestion_wait(int sync, long timeout);
  long wait_iff_congested(struct zone *zone, int sync, long timeout);
  int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@@ -318,4 -235,333 +234,333 @@@ static inline int bdi_sched_wait(void *
        return 0;
  }
  
- #endif                /* _LINUX_BACKING_DEV_H */
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+ void wb_congested_put(struct bdi_writeback_congested *congested);
+ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp);
+ void wb_memcg_offline(struct mem_cgroup *memcg);
+ void wb_blkcg_offline(struct blkcg *blkcg);
+ int inode_congested(struct inode *inode, int cong_bits);
+ /**
+  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+  * @inode: inode of interest
+  *
+  * cgroup writeback requires support from both the bdi and filesystem.
+  * Test whether @inode has both.
+  */
+ static inline bool inode_cgwb_enabled(struct inode *inode)
+ {
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       return bdi_cap_account_dirty(bdi) &&
+               (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+               (inode->i_sb->s_iflags & SB_I_CGROUPWB);
+ }
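
Both halves of the check above are opt-in: the bdi has to advertise
BDI_CAP_CGROUP_WRITEBACK and the filesystem has to set SB_I_CGROUPWB. A hypothetical
filesystem enabling cgroup writeback might do something along these lines (a sketch
only; the function name is illustrative):

/* Sketch: opt a filesystem and its bdi into cgroup writeback.
 * Assumes 'sb' already has a registered backing_dev_info. */
static void example_enable_cgwb(struct super_block *sb)
{
	sb->s_iflags |= SB_I_CGROUPWB;				/* fs side  */
	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;	/* bdi side */
}
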
+ /**
+  * wb_find_current - find wb for %current on a bdi
+  * @bdi: bdi of interest
+  *
+  * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+  * Must be called under rcu_read_lock(), which protects the returned wb.
+  * Returns NULL if not found.
+  */
+ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+ {
+       struct cgroup_subsys_state *memcg_css;
+       struct bdi_writeback *wb;
+       memcg_css = task_css(current, memory_cgrp_id);
+       if (!memcg_css->parent)
+               return &bdi->wb;
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+       /*
+        * %current's blkcg equals the effective blkcg of its memcg.  No
+        * need to use the relatively expensive cgroup_get_e_css().
+        */
+       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+               return wb;
+       return NULL;
+ }
+ /**
+  * wb_get_create_current - get or create wb for %current on a bdi
+  * @bdi: bdi of interest
+  * @gfp: allocation mask
+  *
+  * Equivalent to wb_get_create() on %current's memcg.  This function is
+  * called from a relatively hot path and optimizes the common cases using
+  * wb_find_current().
+  */
+ static inline struct bdi_writeback *
+ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+ {
+       struct bdi_writeback *wb;
+       rcu_read_lock();
+       wb = wb_find_current(bdi);
+       if (wb && unlikely(!wb_tryget(wb)))
+               wb = NULL;
+       rcu_read_unlock();
+       if (unlikely(!wb)) {
+               struct cgroup_subsys_state *memcg_css;
+               memcg_css = task_get_css(current, memory_cgrp_id);
+               wb = wb_get_create(bdi, memcg_css, gfp);
+               css_put(memcg_css);
+       }
+       return wb;
+ }
+ /**
+  * inode_to_wb_is_valid - test whether an inode has a wb associated
+  * @inode: inode of interest
+  *
+  * Returns %true if @inode has a wb associated.  May be called without any
+  * locking.
+  */
+ static inline bool inode_to_wb_is_valid(struct inode *inode)
+ {
+       return inode->i_wb;
+ }
+ /**
+  * inode_to_wb - determine the wb of an inode
+  * @inode: inode of interest
+  *
+  * Returns the wb @inode is currently associated with.  The caller must be
+  * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+  * associated wb's list_lock.
+  */
+ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+ {
+ #ifdef CONFIG_LOCKDEP
+       WARN_ON_ONCE(debug_locks &&
+                    (!lockdep_is_held(&inode->i_lock) &&
+                     !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+                     !lockdep_is_held(&inode->i_wb->list_lock)));
+ #endif
+       return inode->i_wb;
+ }
+ /**
+  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+  * @inode: target inode
+  * @lockedp: temp bool output param, to be passed to the end function
+  *
+  * The caller wants to access the wb associated with @inode but isn't
+  * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+  * function determines the wb associated with @inode and ensures that the
+  * association doesn't change until the transaction is finished with
+  * unlocked_inode_to_wb_end().
+  *
+  * The caller must call unlocked_inode_to_wb_end() with *@lockedp
+  * afterwards and can't sleep during the transaction.  IRQs may or may
+  * not be disabled on return.
+  */
+ static inline struct bdi_writeback *
+ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+ {
+       rcu_read_lock();
+       /*
+        * Paired with store_release in inode_switch_wb_work_fn() and
+        * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+        */
+       *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+       if (unlikely(*lockedp))
+               spin_lock_irq(&inode->i_mapping->tree_lock);
+       /*
+        * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+        * inode_to_wb() would bark about the missing locks; deref directly.
+        */
+       return inode->i_wb;
+ }
+ /**
+  * unlocked_inode_to_wb_end - end inode wb access transaction
+  * @inode: target inode
+  * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+  */
+ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+ {
+       if (unlikely(locked))
+               spin_unlock_irq(&inode->i_mapping->tree_lock);
+       rcu_read_unlock();
+ }
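
The begin/end pair above acts as a small transaction around unlocked stat updates; a
minimal sketch of the calling pattern (the function name and the particular stat item
are only examples):

/* Sketch: bump a per-wb counter without taking i_lock, tree_lock or
 * list_lock just to look up the wb.  Must not sleep in between. */
static void example_account_reclaimable(struct inode *inode)
{
	struct bdi_writeback *wb;
	bool locked;

	wb = unlocked_inode_to_wb_begin(inode, &locked);
	inc_wb_stat(wb, WB_RECLAIMABLE);
	unlocked_inode_to_wb_end(inode, locked);
}
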
+ struct wb_iter {
+       int                     start_blkcg_id;
+       struct radix_tree_iter  tree_iter;
+       void                    **slot;
+ };
+ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi)
+ {
+       struct radix_tree_iter *titer = &iter->tree_iter;
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       if (iter->start_blkcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+               iter->start_blkcg_id = -1;
+       } else {
+               iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+       }
+       if (!iter->slot)
+               iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+       if (iter->slot)
+               return *iter->slot;
+       return NULL;
+ }
+ static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi,
+                                                  int start_blkcg_id)
+ {
+       iter->start_blkcg_id = start_blkcg_id;
+       if (start_blkcg_id)
+               return __wb_iter_next(iter, bdi);
+       else
+               return &bdi->wb;
+ }
+ /**
+  * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+  * @wb_cur: cursor struct bdi_writeback pointer
+  * @bdi: bdi to walk wb's of
+  * @iter: pointer to struct wb_iter to be used as iteration buffer
+  * @start_blkcg_id: blkcg ID to start iteration from
+  *
+  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+  * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+  * to be used as temp storage during iteration.  rcu_read_lock() must be
+  * held throughout iteration.
+  */
+ #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+            (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
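
bdi_for_each_wb() is the walk primitive the flusher-side code builds on. A minimal
usage sketch follows (the function name is illustrative; nr_pages and reason are
supplied by the caller):

/* Sketch: kick writeback on every wb of a bdi that has dirty inodes.
 * rcu_read_lock() must be held across the whole walk. */
static void example_start_writeback_all(struct backing_dev_info *bdi,
					long nr_pages, enum wb_reason reason)
{
	struct bdi_writeback *wb;
	struct wb_iter iter;

	rcu_read_lock();
	bdi_for_each_wb(wb, bdi, &iter, 0)
		if (wb_has_dirty_io(wb))
			wb_start_writeback(wb, nr_pages, false, reason);
	rcu_read_unlock();
}
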
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static inline bool inode_cgwb_enabled(struct inode *inode)
+ {
+       return false;
+ }
+ static inline struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+ {
+       return bdi->wb.congested;
+ }
+ static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+ {
+ }
+ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+ {
+       return &bdi->wb;
+ }
+ static inline struct bdi_writeback *
+ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+ {
+       return &bdi->wb;
+ }
+ static inline bool inode_to_wb_is_valid(struct inode *inode)
+ {
+       return true;
+ }
+ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+ {
+       return &inode_to_bdi(inode)->wb;
+ }
+ static inline struct bdi_writeback *
+ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+ {
+       return inode_to_wb(inode);
+ }
+ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+ {
+ }
+ static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+ {
+ }
+ static inline void wb_blkcg_offline(struct blkcg *blkcg)
+ {
+ }
+ struct wb_iter {
+       int             next_id;
+ };
+ #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
+       for ((iter)->next_id = (start_blkcg_id);                        \
+            ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+ static inline int inode_congested(struct inode *inode, int cong_bits)
+ {
+       return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
+ static inline int inode_read_congested(struct inode *inode)
+ {
+       return inode_congested(inode, 1 << WB_sync_congested);
+ }
+ static inline int inode_write_congested(struct inode *inode)
+ {
+       return inode_congested(inode, 1 << WB_async_congested);
+ }
+ static inline int inode_rw_congested(struct inode *inode)
+ {
+       return inode_congested(inode, (1 << WB_sync_congested) |
+                                     (1 << WB_async_congested));
+ }
+ static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+ {
+       return wb_congested(&bdi->wb, cong_bits);
+ }
+ static inline int bdi_read_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, 1 << WB_sync_congested);
+ }
+ static inline int bdi_write_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, 1 << WB_async_congested);
+ }
+ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, (1 << WB_sync_congested) |
+                                 (1 << WB_async_congested));
+ }
+ #endif        /* _LINUX_BACKING_DEV_H */
diff --combined include/linux/blk-cgroup.h
index 0000000000000000000000000000000000000000,07a32b813ed897d2610fc4abcc205dc6f5f0f0af..58cfab80dd707ff28d8b4e12fdf735bc24f7f60b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,631 +1,655 @@@
 -/* CFQ specific, out here for blkcg->cfq_weight */
 -#define CFQ_WEIGHT_MIN                10
 -#define CFQ_WEIGHT_MAX                1000
 -#define CFQ_WEIGHT_DEFAULT    500
 -
+ #ifndef _BLK_CGROUP_H
+ #define _BLK_CGROUP_H
+ /*
+  * Common Block IO controller cgroup interface
+  *
+  * Based on ideas and code from CFQ, CFS and BFQ:
+  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+  *
+  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+  *                  Paolo Valente <paolo.valente@unimore.it>
+  *
+  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+  *                  Nauman Rafique <nauman@google.com>
+  */
+ #include <linux/cgroup.h>
+ #include <linux/u64_stats_sync.h>
+ #include <linux/seq_file.h>
+ #include <linux/radix-tree.h>
+ #include <linux/blkdev.h>
+ #include <linux/atomic.h>
+ /* Max limits for throttle policy */
+ #define THROTL_IOPS_MAX               UINT_MAX
 -      /* TODO: per-policy storage in blkcg */
 -      unsigned int                    cfq_weight;     /* belongs to cfq */
 -      unsigned int                    cfq_leaf_weight;
+ #ifdef CONFIG_BLK_CGROUP
+ enum blkg_rwstat_type {
+       BLKG_RWSTAT_READ,
+       BLKG_RWSTAT_WRITE,
+       BLKG_RWSTAT_SYNC,
+       BLKG_RWSTAT_ASYNC,
+       BLKG_RWSTAT_NR,
+       BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+ };
+ struct blkcg_gq;
+ struct blkcg {
+       struct cgroup_subsys_state      css;
+       spinlock_t                      lock;
+       struct radix_tree_root          blkg_tree;
+       struct blkcg_gq                 *blkg_hint;
+       struct hlist_head               blkg_list;
++      struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head                cgwb_list;
+ #endif
+ };
+ struct blkg_stat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt;
+ };
+ struct blkg_rwstat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt[BLKG_RWSTAT_NR];
+ };
+ /*
+  * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
+  * request_queue (q).  This is used by blkcg policies which need to track
+  * information per blkcg - q pair.
+  *
+  * There can be multiple active blkcg policies and each has its private
+  * data on each blkg, the size of which is determined by
+  * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+  * together with blkg and invokes pd_init/exit_fn() methods.
+  *
+  * Such private data must embed struct blkg_policy_data (pd) at the
+  * beginning and pd_size can't be smaller than pd.
+  */
+ struct blkg_policy_data {
+       /* the blkg and policy id this per-policy data belongs to */
+       struct blkcg_gq                 *blkg;
+       int                             plid;
+       /* used during policy activation */
+       struct list_head                alloc_node;
+ };
++/*
++ * Policies that need to keep per-blkcg data that is independent of any
++ * request_queue associated with it must specify the size of that data
++ * with the cpd_size field of the blkcg_policy structure and embed a
++ * blkcg_policy_data in it.  blkcg core allocates the policy-specific
++ * per-blkcg structures lazily the first time they are actually needed,
++ * so it handles them together with blkgs.  cpd_init() is invoked to let
++ * each policy handle its per-blkcg data.
++ */
++struct blkcg_policy_data {
++      /* the policy id this per-policy data belongs to */
++      int                             plid;
++
++      /* used during policy activation */
++      struct list_head                alloc_node;
++};
++
+ /* association between a blk cgroup and a request queue */
+ struct blkcg_gq {
+       /* Pointer to the associated request_queue */
+       struct request_queue            *q;
+       struct list_head                q_node;
+       struct hlist_node               blkcg_node;
+       struct blkcg                    *blkcg;
+       /*
+        * Each blkg gets congested separately and the congestion state is
+        * propagated to the matching bdi_writeback_congested.
+        */
+       struct bdi_writeback_congested  *wb_congested;
+       /* all non-root blkcg_gq's are guaranteed to have access to parent */
+       struct blkcg_gq                 *parent;
+       /* request allocation list for this blkcg-q pair */
+       struct request_list             rl;
+       /* reference count */
+       atomic_t                        refcnt;
+       /* is this blkg online? protected by both blkcg and q locks */
+       bool                            online;
+       struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
+       struct rcu_head                 rcu_head;
+ };
++typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+ typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+ struct blkcg_policy {
+       int                             plid;
+       /* policy specific private data size */
+       size_t                          pd_size;
++      /* policy specific per-blkcg data size */
++      size_t                          cpd_size;
+       /* cgroup files for the policy */
+       struct cftype                   *cftypes;
+       /* operations */
++      blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_init_pd_fn            *pd_init_fn;
+       blkcg_pol_online_pd_fn          *pd_online_fn;
+       blkcg_pol_offline_pd_fn         *pd_offline_fn;
+       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
+ };
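As a rough illustration (not part of this patch), a policy with both per-blkg and per-blkcg private data might describe itself to blkcg core roughly as follows; all myp_* identifiers are hypothetical:

#include <linux/module.h>
#include <linux/blk-cgroup.h>

/* hypothetical per-blkcg data; the embedded blkcg_policy_data must come first */
struct myp_cgrp_data {
	struct blkcg_policy_data	cpd;
	unsigned int			weight;
};

/* hypothetical per-(blkcg, request_queue) data; blkg_policy_data must come first */
struct myp_grp_data {
	struct blkg_policy_data		pd;
	u64				bytes_issued;
};

static void myp_cpd_init(const struct blkcg *blkcg)
{
	/* blkcg core has already allocated cpd_size bytes for this blkcg */
}

static void myp_pd_init(struct blkcg_gq *blkg)
{
	/* blkcg core has already allocated pd_size bytes for this blkg */
}

static struct blkcg_policy myp_policy = {
	.pd_size	= sizeof(struct myp_grp_data),
	.cpd_size	= sizeof(struct myp_cgrp_data),
	.cpd_init_fn	= myp_cpd_init,
	.pd_init_fn	= myp_pd_init,
};

static int __init myp_init(void)
{
	/* ->plid is assigned by blkcg core on successful registration */
	return blkcg_policy_register(&myp_policy);
}
module_init(myp_init);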
+ extern struct blkcg blkcg_root;
+ extern struct cgroup_subsys_state * const blkcg_root_css;
+ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+                                   struct request_queue *q);
+ int blkcg_init_queue(struct request_queue *q);
+ void blkcg_drain_queue(struct request_queue *q);
+ void blkcg_exit_queue(struct request_queue *q);
+ /* Blkio controller policy registration */
+ int blkcg_policy_register(struct blkcg_policy *pol);
+ void blkcg_policy_unregister(struct blkcg_policy *pol);
+ int blkcg_activate_policy(struct request_queue *q,
+                         const struct blkcg_policy *pol);
+ void blkcg_deactivate_policy(struct request_queue *q,
+                            const struct blkcg_policy *pol);
+ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+                      u64 (*prfill)(struct seq_file *,
+                                    struct blkg_policy_data *, int),
+                      const struct blkcg_policy *pol, int data,
+                      bool show_total);
+ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                        const struct blkg_rwstat *rwstat);
+ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                      int off);
+ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+                                            int off);
+ struct blkg_conf_ctx {
+       struct gendisk                  *disk;
+       struct blkcg_gq                 *blkg;
+       u64                             v;
+ };
+ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+                  const char *input, struct blkg_conf_ctx *ctx);
+ void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+ {
+       return css ? container_of(css, struct blkcg, css) : NULL;
+ }
+ static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+ {
+       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+ }
+ static inline struct blkcg *bio_blkcg(struct bio *bio)
+ {
+       if (bio && bio->bi_css)
+               return css_to_blkcg(bio->bi_css);
+       return task_blkcg(current);
+ }
+ static inline struct cgroup_subsys_state *
+ task_get_blkcg_css(struct task_struct *task)
+ {
+       return task_get_css(task, blkio_cgrp_id);
+ }
+ /**
+  * blkcg_parent - get the parent of a blkcg
+  * @blkcg: blkcg of interest
+  *
+  * Return the parent blkcg of @blkcg.  Can be called anytime.
+  */
+ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+ {
+       return css_to_blkcg(blkcg->css.parent);
+ }
+ /**
+  * blkg_to_pd - get policy private data
+  * @blkg: blkg of interest
+  * @pol: policy of interest
+  *
+  * Return pointer to private data associated with the @blkg-@pol pair.
+  */
+ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol)
+ {
+       return blkg ? blkg->pd[pol->plid] : NULL;
+ }
++static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
++                                                   struct blkcg_policy *pol)
++{
++      return blkcg ? blkcg->pd[pol->plid] : NULL;
++}
++
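In practice a policy wraps these two accessors in container_of() helpers that recover its embedding structures; a minimal sketch, reusing the hypothetical myp_* types and myp_policy from the sketch further up:

static inline struct myp_grp_data *blkg_to_myp(struct blkcg_gq *blkg)
{
	struct blkg_policy_data *pd = blkg_to_pd(blkg, &myp_policy);

	return pd ? container_of(pd, struct myp_grp_data, pd) : NULL;
}

static inline struct myp_cgrp_data *blkcg_to_myp(struct blkcg *blkcg)
{
	struct blkcg_policy_data *cpd = blkcg_to_cpd(blkcg, &myp_policy);

	return cpd ? container_of(cpd, struct myp_cgrp_data, cpd) : NULL;
}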
+ /**
+  * pd_to_blkg - get blkg associated with policy private data
+  * @pd: policy private data of interest
+  *
+  * @pd is policy private data.  Determine the blkg it's associated with.
+  */
+ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+ {
+       return pd ? pd->blkg : NULL;
+ }
+ /**
+  * blkg_path - format cgroup path of blkg
+  * @blkg: blkg of interest
+  * @buf: target buffer
+  * @buflen: target buffer length
+  *
+  * Format the path of the cgroup of @blkg into @buf.
+  */
+ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+ {
+       char *p;
+       p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+       if (!p) {
+               strncpy(buf, "<unavailable>", buflen);
+               return -ENAMETOOLONG;
+       }
+       memmove(buf, p, buf + buflen - p);
+       return 0;
+ }
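For example, a policy that wants to log which cgroup a blkg belongs to could do something along these lines (the function name and buffer size are illustrative only):

static void myp_report(struct blkcg_gq *blkg)
{
	char path[128];

	if (!blkg_path(blkg, path, sizeof(path)))
		pr_info("myp: blkg belongs to cgroup %s\n", path);
}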
+ /**
+  * blkg_get - get a blkg reference
+  * @blkg: blkg to get
+  *
+  * The caller should be holding an existing reference.
+  */
+ static inline void blkg_get(struct blkcg_gq *blkg)
+ {
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       atomic_inc(&blkg->refcnt);
+ }
+ void __blkg_release_rcu(struct rcu_head *rcu);
+ /**
+  * blkg_put - put a blkg reference
+  * @blkg: blkg to put
+  */
+ static inline void blkg_put(struct blkcg_gq *blkg)
+ {
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       if (atomic_dec_and_test(&blkg->refcnt))
+               call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+ }
+ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+                              bool update_hint);
+ /**
+  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+  * @d_blkg: loop cursor pointing to the current descendant
+  * @pos_css: used for iteration
+  * @p_blkg: target blkg to walk descendants of
+  *
+  * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+  * read locked.  If called under either blkcg or queue lock, the iteration
+  * is guaranteed to include all and only online blkgs.  The caller may
+  * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+  * @p_blkg is included in the iteration and the first node to be visited.
+  */
+ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)         \
+       css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
+ /**
+  * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+  * @d_blkg: loop cursor pointing to the current descendant
+  * @pos_css: used for iteration
+  * @p_blkg: target blkg to walk descendants of
+  *
+  * Similar to blkg_for_each_descendant_pre() but performs post-order
+  * traversal instead.  Synchronization rules are the same.  @p_blkg is
+  * included in the iteration and the last node to be visited.
+  */
+ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
+       css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
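A hedged usage sketch, resetting the hypothetical myp counters across a subtree with the pre-order walk (under RCU only; hold the blkcg or queue lock as well if the walk must see exactly the online blkgs):

static void myp_reset_subtree(struct blkcg_gq *parent_blkg)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
		struct myp_grp_data *mgd = blkg_to_myp(blkg);

		if (mgd)
			mgd->bytes_issued = 0;
	}
	rcu_read_unlock();
}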
+ /**
+  * blk_get_rl - get request_list to use
+  * @q: request_queue of interest
+  * @bio: bio which will be attached to the allocated request (may be %NULL)
+  *
+  * The caller wants to allocate a request from @q to use for @bio.  Find
+  * the request_list to use and obtain a reference on it.  Should be called
+  * under queue_lock.  This function is guaranteed to return a non-%NULL
+  * request_list.
+  */
+ static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio)
+ {
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+       /* bypass blkg lookup and use @q->root_rl directly for root */
+       if (blkcg == &blkcg_root)
+               goto root_rl;
+       /*
+        * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+        * or if either the blkcg or queue is going away.  Fall back to
+        * root_rl in such cases.
+        */
+       blkg = blkg_lookup_create(blkcg, q);
+       if (unlikely(IS_ERR(blkg)))
+               goto root_rl;
+       blkg_get(blkg);
+       rcu_read_unlock();
+       return &blkg->rl;
+ root_rl:
+       rcu_read_unlock();
+       return &q->root_rl;
+ }
+ /**
+  * blk_put_rl - put request_list
+  * @rl: request_list to put
+  *
+  * Put the reference acquired by blk_get_rl().  Should be called under
+  * queue_lock.
+  */
+ static inline void blk_put_rl(struct request_list *rl)
+ {
+       /* root_rl may not have blkg set */
+       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+               blkg_put(rl->blkg);
+ }
+ /**
+  * blk_rq_set_rl - associate a request with a request_list
+  * @rq: request of interest
+  * @rl: target request_list
+  *
+  * Associate @rq with @rl so that accounting and freeing can know the
+  * request_list @rq came from.
+  */
+ static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+ {
+       rq->rl = rl;
+ }
+ /**
+  * blk_rq_rl - return the request_list a request came from
+  * @rq: request of interest
+  *
+  * Return the request_list @rq is allocated from.
+  */
+ static inline struct request_list *blk_rq_rl(struct request *rq)
+ {
+       return rq->rl;
+ }
+ struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                        struct request_queue *q);
+ /**
+  * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+  *
+  * Should be used under queue_lock.
+  */
+ #define blk_queue_for_each_rl(rl, q)  \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
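Taken together, the request_list helpers pair up as in the following sketch of an allocation and completion path (loosely modelled on blk-core usage, not literal code from this patch; both hypothetical helpers assume q->queue_lock is held):

static void myp_attach_rl(struct request_queue *q, struct bio *bio,
			  struct request *rq)
{
	struct request_list *rl = blk_get_rl(q, bio);	/* guaranteed non-NULL */

	blk_rq_set_rl(rq, rl);		/* remember which rl @rq came from */
}

static void myp_detach_rl(struct request *rq)
{
	/* drop the reference taken by blk_get_rl() when the request is freed */
	blk_put_rl(blk_rq_rl(rq));
}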
+ static inline void blkg_stat_init(struct blkg_stat *stat)
+ {
+       u64_stats_init(&stat->syncp);
+ }
+ /**
+  * blkg_stat_add - add a value to a blkg_stat
+  * @stat: target blkg_stat
+  * @val: value to add
+  *
+  * Add @val to @stat.  The caller is responsible for synchronizing calls to
+  * this function.
+  */
+ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+ {
+       u64_stats_update_begin(&stat->syncp);
+       stat->cnt += val;
+       u64_stats_update_end(&stat->syncp);
+ }
+ /**
+  * blkg_stat_read - read the current value of a blkg_stat
+  * @stat: blkg_stat to read
+  *
+  * Read the current value of @stat.  This function can be called without
+  * synchronization and takes care of u64 atomicity.
+  */
+ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+ {
+       unsigned int start;
+       uint64_t v;
+       do {
+               start = u64_stats_fetch_begin_irq(&stat->syncp);
+               v = stat->cnt;
+       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
+       return v;
+ }
+ /**
+  * blkg_stat_reset - reset a blkg_stat
+  * @stat: blkg_stat to reset
+  */
+ static inline void blkg_stat_reset(struct blkg_stat *stat)
+ {
+       stat->cnt = 0;
+ }
+ /**
+  * blkg_stat_merge - merge a blkg_stat into another
+  * @to: the destination blkg_stat
+  * @from: the source
+  *
+  * Add @from's count to @to.
+  */
+ static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+ {
+       blkg_stat_add(to, blkg_stat_read(from));
+ }
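A short sketch of the intended usage pattern for a plain blkg_stat counter (the surrounding function is hypothetical):

static void myp_stat_demo(struct blkg_stat *stat)
{
	blkg_stat_init(stat);		/* once, e.g. from the policy's pd_init_fn() */

	blkg_stat_add(stat, 1);		/* updates must be serialized by the caller */

	pr_info("count=%llu\n",		/* reads need no serialization */
		(unsigned long long)blkg_stat_read(stat));
}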
+ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+ {
+       u64_stats_init(&rwstat->syncp);
+ }
+ /**
+  * blkg_rwstat_add - add a value to a blkg_rwstat
+  * @rwstat: target blkg_rwstat
+  * @rw: mask of REQ_{WRITE|SYNC}
+  * @val: value to add
+  *
+  * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+  * caller is responsible for synchronizing calls to this function.
+  */
+ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+                                  int rw, uint64_t val)
+ {
+       u64_stats_update_begin(&rwstat->syncp);
+       if (rw & REQ_WRITE)
+               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+       if (rw & REQ_SYNC)
+               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+       u64_stats_update_end(&rwstat->syncp);
+ }
+ /**
+  * blkg_rwstat_read - read the current values of a blkg_rwstat
+  * @rwstat: blkg_rwstat to read
+  *
+  * Read the current snapshot of @rwstat and return it as the return value.
+  * This function can be called without synchronization and takes care of
+  * u64 atomicity.
+  */
+ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+ {
+       unsigned int start;
+       struct blkg_rwstat tmp;
+       do {
+               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
+               tmp = *rwstat;
+       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       return tmp;
+ }
+ /**
+  * blkg_rwstat_total - read the total count of a blkg_rwstat
+  * @rwstat: blkg_rwstat to read
+  *
+  * Return the total count of @rwstat regardless of the IO direction.  This
+  * function can be called without synchronization and takes care of u64
+  * atomicity.
+  */
+ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+ {
+       struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+ }
+ /**
+  * blkg_rwstat_reset - reset a blkg_rwstat
+  * @rwstat: blkg_rwstat to reset
+  */
+ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+ {
+       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+ }
+ /**
+  * blkg_rwstat_merge - merge a blkg_rwstat into another
+  * @to: the destination blkg_rwstat
+  * @from: the source
+  *
+  * Add @from's counts to @to.
+  */
+ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+                                    struct blkg_rwstat *from)
+ {
+       struct blkg_rwstat v = blkg_rwstat_read(from);
+       int i;
+       u64_stats_update_begin(&to->syncp);
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               to->cnt[i] += v.cnt[i];
+       u64_stats_update_end(&to->syncp);
+ }
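For illustration, a hedged sketch of accounting a bio into a blkg_rwstat and reading it back; bio->bi_rw carries the REQ_WRITE/REQ_SYNC bits that blkg_rwstat_add() switches on, and the myp_* names are hypothetical:

static void myp_account_bio(struct blkg_rwstat *rwstat, struct bio *bio)
{
	/* updates are serialized by the caller, typically under the queue lock */
	blkg_rwstat_add(rwstat, bio->bi_rw, bio->bi_iter.bi_size);
}

static void myp_dump_rwstat(struct blkg_rwstat *rwstat)
{
	struct blkg_rwstat snap = blkg_rwstat_read(rwstat);	/* lockless snapshot */

	/* READ + WRITE covers every byte once; SYNC/ASYNC is a parallel split */
	pr_info("rd=%llu wr=%llu total=%llu\n",
		(unsigned long long)snap.cnt[BLKG_RWSTAT_READ],
		(unsigned long long)snap.cnt[BLKG_RWSTAT_WRITE],
		(unsigned long long)blkg_rwstat_total(rwstat));
}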
+ #else /* CONFIG_BLK_CGROUP */
+ struct blkcg {
+ };
+ struct blkg_policy_data {
+ };
++struct blkcg_policy_data {
++};
++
+ struct blkcg_gq {
+ };
+ struct blkcg_policy {
+ };
+ #define blkcg_root_css        ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+ static inline struct cgroup_subsys_state *
+ task_get_blkcg_css(struct task_struct *task)
+ {
+       return NULL;
+ }
+ #ifdef CONFIG_BLOCK
+ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+ static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+ static inline void blkcg_drain_queue(struct request_queue *q) { }
+ static inline void blkcg_exit_queue(struct request_queue *q) { }
+ static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+ static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+ static inline int blkcg_activate_policy(struct request_queue *q,
+                                       const struct blkcg_policy *pol) { return 0; }
+ static inline void blkcg_deactivate_policy(struct request_queue *q,
+                                          const struct blkcg_policy *pol) { }
+ static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol) { return NULL; }
+ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+ static inline void blkg_get(struct blkcg_gq *blkg) { }
+ static inline void blkg_put(struct blkcg_gq *blkg) { }
+ static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio) { return &q->root_rl; }
+ static inline void blk_put_rl(struct request_list *rl) { }
+ static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+ #define blk_queue_for_each_rl(rl, q)  \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+ #endif        /* CONFIG_BLOCK */
+ #endif        /* CONFIG_BLK_CGROUP */
+ #endif        /* _BLK_CGROUP_H */
diff --combined include/linux/blkdev.h
index 5ced29cef03f7b01819019e7c34cbbc1b2b549a5,ab4a27852f1bdaf836825def801e727de3e8fe50..7f2f54b4587f78e17536c9197fe8172b566f8802
@@@ -12,7 -12,7 +12,7 @@@
  #include <linux/timer.h>
  #include <linux/workqueue.h>
  #include <linux/pagemap.h>
- #include <linux/backing-dev.h>
+ #include <linux/backing-dev-defs.h>
  #include <linux/wait.h>
  #include <linux/mempool.h>
  #include <linux/bio.h>
@@@ -22,7 -22,8 +22,7 @@@
  #include <linux/smp.h>
  #include <linux/rcupdate.h>
  #include <linux/percpu-refcount.h>
 -
 -#include <asm/scatterlist.h>
 +#include <linux/scatterlist.h>
  
  struct module;
  struct scsi_ioctl_command;
@@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques
  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
  
- /*
-  * A queue has just exitted congestion.  Note this in the global counter of
-  * congested queues, and wake up anyone who was waiting for requests to be
-  * put back.
-  */
- static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
- {
-       clear_bdi_congested(&q->backing_dev_info, sync);
- }
- /*
-  * A queue has just entered congestion.  Flag that in the queue's VM-visible
-  * state flags and increment the global gounter of congested queues.
-  */
- static inline void blk_set_queue_congested(struct request_queue *q, int sync)
- {
-       set_bdi_congested(&q->backing_dev_info, sync);
- }
 -extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
--
  extern void blk_start_queue(struct request_queue *q);
  extern void blk_stop_queue(struct request_queue *q);
  extern void blk_sync_queue(struct request_queue *q);
@@@ -1021,7 -1005,6 +1002,7 @@@ bool __must_check blk_get_queue(struct 
  struct request_queue *blk_alloc_queue(gfp_t);
  struct request_queue *blk_alloc_queue_node(gfp_t, int);
  extern void blk_put_queue(struct request_queue *);
 +extern void blk_set_queue_dying(struct request_queue *);
  
  /*
   * block layer runtime pm functions
diff --combined include/linux/fs.h
index 5db7b1379d174848116124b5f9c26fc212798d21,2c5e33a5b2af4af5934597b082452858d7717f28..e351da4a934f415b4ba0d2cc84acaa52c8120614
  #include <uapi/linux/fs.h>
  
  struct backing_dev_info;
+ struct bdi_writeback;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
 -struct nameidata;
  struct kiocb;
  struct kobject;
  struct pipe_inode_info;
@@@ -634,6 -636,14 +635,14 @@@ struct inode 
  
        struct hlist_node       i_hash;
        struct list_head        i_wb_list;      /* backing dev IO list */
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct bdi_writeback    *i_wb;          /* the associated cgroup wb */
+       /* foreign inode detection, see wbc_detach_inode() */
+       int                     i_wb_frn_winner;
+       u16                     i_wb_frn_avg_time;
+       u16                     i_wb_frn_history;
+ #endif
        struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        union {
                struct pipe_inode_info  *i_pipe;
                struct block_device     *i_bdev;
                struct cdev             *i_cdev;
 +              char                    *i_link;
        };
  
        __u32                   i_generation;
@@@ -1232,6 -1241,8 +1241,8 @@@ struct mm_struct
  #define UMOUNT_NOFOLLOW       0x00000008      /* Don't follow symlink on umount */
  #define UMOUNT_UNUSED 0x80000000      /* Flag guaranteed to be unused */
  
+ /* sb->s_iflags */
+ #define SB_I_CGROUPWB 0x00000001      /* cgroup-aware writeback enabled */
  
  /* Possible states of 'frozen' field */
  enum {
@@@ -1270,6 -1281,7 +1281,7 @@@ struct super_block 
        const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
+       unsigned long           s_iflags;       /* internal SB_I_* flags */
        unsigned long           s_magic;
        struct dentry           *s_root;
        struct rw_semaphore     s_umount;
@@@ -1607,12 -1619,12 +1619,12 @@@ struct file_operations 
  
  struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 -      void * (*follow_link) (struct dentry *, struct nameidata *);
 +      const char * (*follow_link) (struct dentry *, void **);
        int (*permission) (struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);
  
        int (*readlink) (struct dentry *, char __user *,int);
 -      void (*put_link) (struct dentry *, struct nameidata *, void *);
 +      void (*put_link) (struct inode *, void *);
  
        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
@@@ -1806,6 -1818,11 +1818,11 @@@ struct super_operations 
   *
   * I_DIO_WAKEUP               Never set.  Only used as a key for wait_on_bit().
   *
+  * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
+  *                    synchronize competing switching instances and to tell
+  *                    wb stat updates to grab mapping->tree_lock.  See
+  *                    inode_switch_wb_work_fn() for details.
+  *
   * Q: What is the difference between I_WILL_FREE and I_FREEING?
   */
  #define I_DIRTY_SYNC          (1 << 0)
  #define I_DIRTY_TIME          (1 << 11)
  #define __I_DIRTY_TIME_EXPIRED        12
  #define I_DIRTY_TIME_EXPIRED  (1 << __I_DIRTY_TIME_EXPIRED)
+ #define I_WB_SWITCH           (1 << 13)
  
  #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
  #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@@ -1879,7 -1897,6 +1897,7 @@@ enum file_time_flags 
        S_VERSION = 8,
  };
  
 +extern bool atime_needs_update(const struct path *, struct inode *);
  extern void touch_atime(const struct path *);
  static inline void file_accessed(struct file *file)
  {
@@@ -2241,7 -2258,13 +2259,13 @@@ extern struct super_block *freeze_bdev(
  extern void emergency_thaw_all(void);
  extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
  extern int fsync_bdev(struct block_device *);
- extern int sb_is_blkdev_sb(struct super_block *sb);
+ extern struct super_block *blockdev_superblock;
+ static inline bool sb_is_blkdev_sb(struct super_block *sb)
+ {
+       return sb == blockdev_superblock;
+ }
  #else
  static inline void bd_forget(struct inode *inode) {}
  static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@@ -2708,14 -2731,13 +2732,14 @@@ extern const struct file_operations gen
  
  extern int readlink_copy(char __user *, int, const char *);
  extern int page_readlink(struct dentry *, char __user *, int);
 -extern void *page_follow_link_light(struct dentry *, struct nameidata *);
 -extern void page_put_link(struct dentry *, struct nameidata *, void *);
 +extern const char *page_follow_link_light(struct dentry *, void **);
 +extern void page_put_link(struct inode *, void *);
  extern int __page_symlink(struct inode *inode, const char *symname, int len,
                int nofs);
  extern int page_symlink(struct inode *inode, const char *symname, int len);
  extern const struct inode_operations page_symlink_inode_operations;
 -extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
 +extern void kfree_put_link(struct inode *, void *);
 +extern void free_page_put_link(struct inode *, void *);
  extern int generic_readlink(struct dentry *, char __user *, int);
  extern void generic_fillattr(struct inode *, struct kstat *);
  int vfs_getattr_nosec(struct path *path, struct kstat *stat);
@@@ -2726,8 -2748,6 +2750,8 @@@ void __inode_sub_bytes(struct inode *in
  void inode_sub_bytes(struct inode *inode, loff_t bytes);
  loff_t inode_get_bytes(struct inode *inode);
  void inode_set_bytes(struct inode *inode, loff_t bytes);
 +const char *simple_follow_link(struct dentry *, void **);
 +extern const struct inode_operations simple_symlink_inode_operations;
  
  extern int iterate_dir(struct file *, struct dir_context *);
  
diff --combined include/linux/memcontrol.h
index 6c8918114804fda89d00ed3e6b1482539f2dd4ee,c3eb19e2bc1c43da6d4738fb2bf2daf78e271517..73b02b0a8f609ac757de6ee59b23bcf8b0e87396
@@@ -41,6 -41,7 +41,7 @@@ enum mem_cgroup_stat_index 
        MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
+       MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
        MEM_CGROUP_STAT_WRITEBACK,      /* # of pages under writeback */
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
@@@ -67,6 -68,8 +68,8 @@@ enum mem_cgroup_events_index 
  };
  
  #ifdef CONFIG_MEMCG
+ extern struct cgroup_subsys_state *mem_cgroup_root_css;
  void mem_cgroup_events(struct mem_cgroup *memcg,
                       enum mem_cgroup_events_index idx,
                       unsigned int nr);
@@@ -112,6 -115,7 +115,7 @@@ static inline bool mm_match_cgroup(stru
  }
  
  extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+ extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
  
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
@@@ -195,6 -199,8 +199,8 @@@ void mem_cgroup_split_huge_fixup(struc
  #else /* CONFIG_MEMCG */
  struct mem_cgroup;
  
+ #define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
  static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                     enum mem_cgroup_events_index idx,
                                     unsigned int nr)
@@@ -382,6 -388,29 +388,29 @@@ enum 
        OVER_LIMIT,
  };
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback);
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+ {
+       return NULL;
+ }
+ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+                                      unsigned long *pavail,
+                                      unsigned long *pdirty,
+                                      unsigned long *pwriteback)
+ {
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  struct sock;
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
  void sock_update_memcg(struct sock *sk);
@@@ -463,8 -492,6 +492,8 @@@ memcg_kmem_newpage_charge(gfp_t gfp, st
        if (!memcg_kmem_enabled())
                return true;
  
 +      if (gfp & __GFP_NOACCOUNT)
 +              return true;
        /*
         * __GFP_NOFAIL allocations will move on even if charging is not
         * possible. Therefore we don't even try, and have this allocation
@@@ -524,8 -551,6 +553,8 @@@ memcg_kmem_get_cache(struct kmem_cache 
  {
        if (!memcg_kmem_enabled())
                return cachep;
 +      if (gfp & __GFP_NOACCOUNT)
 +              return cachep;
        if (gfp & __GFP_NOFAIL)
                return cachep;
        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
diff --combined include/linux/mm.h
index 24ad583596d1219b4ec1111e5aea3045230ee650,4024543b4203eb86d29ef8b162289efd1a775f7a..99959a34f4f15e6d66b8a6681b256134634164ee
@@@ -27,6 -27,7 +27,7 @@@ struct anon_vma_chain
  struct file_ra_state;
  struct user_struct;
  struct writeback_control;
+ struct bdi_writeback;
  
  #ifndef CONFIG_NEED_MULTIPLE_NODES    /* Don't use mapnrs, do it properly */
  extern unsigned long max_mapnr;
@@@ -499,7 -500,7 +500,7 @@@ static inline int page_count(struct pag
  
  static inline bool __compound_tail_refcounted(struct page *page)
  {
 -      return !PageSlab(page) && !PageHeadHuge(page);
 +      return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
  }
  
  /*
@@@ -1211,10 -1212,13 +1212,13 @@@ int __set_page_dirty_nobuffers(struct p
  int __set_page_dirty_no_writeback(struct page *page);
  int redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);
- void account_page_dirtied(struct page *page, struct address_space *mapping);
- void account_page_cleaned(struct page *page, struct address_space *mapping);
+ void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg);
+ void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb);
  int set_page_dirty(struct page *page);
  int set_page_dirty_lock(struct page *page);
+ void cancel_dirty_page(struct page *page);
  int clear_page_dirty_for_io(struct page *page);
  
  int get_cmdline(struct task_struct *task, char *buffer, int buflen);
@@@ -2146,47 -2150,12 +2150,47 @@@ enum mf_flags 
  extern int memory_failure(unsigned long pfn, int trapno, int flags);
  extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
  extern int unpoison_memory(unsigned long pfn);
 +extern int get_hwpoison_page(struct page *page);
  extern int sysctl_memory_failure_early_kill;
  extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p, int access);
  extern atomic_long_t num_poisoned_pages;
  extern int soft_offline_page(struct page *page, int flags);
  
 +
 +/*
 + * Error handlers for various types of pages.
 + */
 +enum mf_result {
 +      MF_IGNORED,     /* Error: cannot be handled */
 +      MF_FAILED,      /* Error: handling failed */
 +      MF_DELAYED,     /* Will be handled later */
 +      MF_RECOVERED,   /* Successfully recovered */
 +};
 +
 +enum mf_action_page_type {
 +      MF_MSG_KERNEL,
 +      MF_MSG_KERNEL_HIGH_ORDER,
 +      MF_MSG_SLAB,
 +      MF_MSG_DIFFERENT_COMPOUND,
 +      MF_MSG_POISONED_HUGE,
 +      MF_MSG_HUGE,
 +      MF_MSG_FREE_HUGE,
 +      MF_MSG_UNMAP_FAILED,
 +      MF_MSG_DIRTY_SWAPCACHE,
 +      MF_MSG_CLEAN_SWAPCACHE,
 +      MF_MSG_DIRTY_MLOCKED_LRU,
 +      MF_MSG_CLEAN_MLOCKED_LRU,
 +      MF_MSG_DIRTY_UNEVICTABLE_LRU,
 +      MF_MSG_CLEAN_UNEVICTABLE_LRU,
 +      MF_MSG_DIRTY_LRU,
 +      MF_MSG_CLEAN_LRU,
 +      MF_MSG_TRUNCATED_LRU,
 +      MF_MSG_BUDDY,
 +      MF_MSG_BUDDY_2ND,
 +      MF_MSG_UNKNOWN,
 +};
 +
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
  extern void clear_huge_page(struct page *page,
                            unsigned long addr,
diff --combined include/trace/events/writeback.h
index c178d13d6f4c0cb51d441c59e7b4975a1913ed3e,bec69995968f2f0d4a0025f386e66c6a4699ec47..a7aa607a4c55e51ec8ba8a6593828604e31a7aac
@@@ -250,6 -250,7 +250,6 @@@ DEFINE_EVENT(writeback_class, name, 
  DEFINE_WRITEBACK_EVENT(writeback_nowork);
  DEFINE_WRITEBACK_EVENT(writeback_wake_background);
  DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 -DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
  
  DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@@ -360,7 -361,7 +360,7 @@@ TRACE_EVENT(global_dirty_state
                __entry->nr_written     = global_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh   = dirty_thresh;
-               __entry->dirty_limit = global_dirty_limit;
+               __entry->dirty_limit    = global_wb_domain.dirty_limit;
        ),
  
        TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@@ -399,13 -400,13 +399,13 @@@ TRACE_EVENT(bdi_dirty_ratelimit
  
        TP_fast_assign(
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->avg_write_bandwidth);
+               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
+               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
                __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
-                                         KBps(bdi->balanced_dirty_ratelimit);
+                                       KBps(bdi->wb.balanced_dirty_ratelimit);
        ),
  
        TP_printk("bdi %s: "
@@@ -462,8 -463,9 +462,9 @@@ TRACE_EVENT(balance_dirty_pages
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
  
-               __entry->limit          = global_dirty_limit;
-               __entry->setpoint       = (global_dirty_limit + freerun) / 2;
+               __entry->limit          = global_wb_domain.dirty_limit;
+               __entry->setpoint       = (global_wb_domain.dirty_limit +
+                                               freerun) / 2;
                __entry->dirty          = dirty;
                __entry->bdi_setpoint   = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
diff --combined init/Kconfig
index b999fa381bf9fe1f37757af5e0a454cc6adb2da9,d4f763332f9f4a2c2417709194ea60026b37b4db..7260b27ebbabeb4537bc747cc604858e070baf00
@@@ -465,9 -465,13 +465,9 @@@ endmenu # "CPU/Task time and stats acco
  
  menu "RCU Subsystem"
  
 -choice
 -      prompt "RCU Implementation"
 -      default TREE_RCU
 -
  config TREE_RCU
 -      bool "Tree-based hierarchical RCU"
 -      depends on !PREEMPT && SMP
 +      bool
 +      default y if !PREEMPT && SMP
        help
          This option selects the RCU implementation that is
          designed for very large SMP system with hundreds or
          smaller systems.
  
  config PREEMPT_RCU
 -      bool "Preemptible tree-based hierarchical RCU"
 -      depends on PREEMPT
 +      bool
 +      default y if PREEMPT
        help
          This option selects the RCU implementation that is
          designed for very large SMP systems with hundreds or
          Select this option if you are unsure.
  
  config TINY_RCU
 -      bool "UP-only small-memory-footprint RCU"
 -      depends on !PREEMPT && !SMP
 +      bool
 +      default y if !PREEMPT && !SMP
        help
          This option selects the RCU implementation that is
          designed for UP systems from which real-time response
          is not required.  This option greatly reduces the
          memory footprint of RCU.
  
 -endchoice
 +config RCU_EXPERT
 +      bool "Make expert-level adjustments to RCU configuration"
 +      default n
 +      help
 +        This option needs to be enabled if you wish to make
 +        expert-level adjustments to RCU configuration.  By default,
 +        no such adjustments can be made, which has the often-beneficial
 +        side-effect of preventing "make oldconfig" from asking you all
 +        sorts of detailed questions about how you would like numerous
 +        obscure RCU options to be set up.
 +
 +        Say Y if you need to make expert-level adjustments to RCU.
 +
 +        Say N if you are unsure.
  
  config SRCU
        bool
          sections.
  
  config TASKS_RCU
 -      bool "Task_based RCU implementation using voluntary context switch"
 +      bool
        default n
        select SRCU
        help
          only voluntary context switch (not preemption!), idle, and
          user-mode execution as quiescent states.
  
 -        If unsure, say N.
 -
  config RCU_STALL_COMMON
        def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
        help
@@@ -538,7 -531,9 +538,7 @@@ config CONTEXT_TRACKIN
         bool
  
  config RCU_USER_QS
 -      bool "Consider userspace as in RCU extended quiescent state"
 -      depends on HAVE_CONTEXT_TRACKING && SMP
 -      select CONTEXT_TRACKING
 +      bool
        help
          This option sets hooks on kernel / userspace boundaries and
          puts RCU in extended quiescent state when the CPU runs in
          excluded from the global RCU state machine and thus doesn't
          try to keep the timer tick on for RCU.
  
 -        Unless you want to hack and help the development of the full
 -        dynticks mode, you shouldn't enable this option.  It also
 -        adds unnecessary overhead.
 -
 -        If unsure say N
 -
  config CONTEXT_TRACKING_FORCE
        bool "Force context tracking"
        depends on CONTEXT_TRACKING
@@@ -577,7 -578,7 +577,7 @@@ config RCU_FANOU
        int "Tree-based hierarchical RCU fanout value"
        range 2 64 if 64BIT
        range 2 32 if !64BIT
 -      depends on TREE_RCU || PREEMPT_RCU
 +      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
        default 64 if 64BIT
        default 32 if !64BIT
        help
  
  config RCU_FANOUT_LEAF
        int "Tree-based hierarchical RCU leaf-level fanout value"
 -      range 2 RCU_FANOUT if 64BIT
 -      range 2 RCU_FANOUT if !64BIT
 -      depends on TREE_RCU || PREEMPT_RCU
 +      range 2 64 if 64BIT
 +      range 2 32 if !64BIT
 +      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
        default 16
        help
          This option controls the leaf-level fanout of hierarchical
  
          Take the default if unsure.
  
 -config RCU_FANOUT_EXACT
 -      bool "Disable tree-based hierarchical RCU auto-balancing"
 -      depends on TREE_RCU || PREEMPT_RCU
 -      default n
 -      help
 -        This option forces use of the exact RCU_FANOUT value specified,
 -        regardless of imbalances in the hierarchy.  This is useful for
 -        testing RCU itself, and might one day be useful on systems with
 -        strong NUMA behavior.
 -
 -        Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
 -
 -        Say N if unsure.
 -
  config RCU_FAST_NO_HZ
        bool "Accelerate last non-dyntick-idle CPU's grace periods"
 -      depends on NO_HZ_COMMON && SMP
 +      depends on NO_HZ_COMMON && SMP && RCU_EXPERT
        default n
        help
          This option permits CPUs to enter dynticks-idle state even if
@@@ -648,7 -663,7 +648,7 @@@ config TREE_RCU_TRAC
  
  config RCU_BOOST
        bool "Enable RCU priority boosting"
 -      depends on RT_MUTEXES && PREEMPT_RCU
 +      depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
        default n
        help
          This option boosts the priority of preempted RCU readers that
@@@ -665,7 -680,6 +665,7 @@@ config RCU_KTHREAD_PRI
        range 0 99 if !RCU_BOOST
        default 1 if RCU_BOOST
        default 0 if !RCU_BOOST
 +      depends on RCU_EXPERT
        help
          This option specifies the SCHED_FIFO priority value that will be
          assigned to the rcuc/n and rcub/n threads and is also the value
@@@ -1127,6 -1141,11 +1127,11 @@@ config DEBUG_BLK_CGROU
        Enable some debugging help. Currently it exports additional stat
        files in a cgroup which can be useful for debugging.
  
+ config CGROUP_WRITEBACK
+       bool
+       depends on MEMCG && BLK_CGROUP
+       default y
  endif # CGROUPS
  
  config CHECKPOINT_RESTORE
@@@ -1623,7 -1642,7 +1628,7 @@@ config PERF_EVENT
  config DEBUG_PERF_USE_VMALLOC
        default n
        bool "Debug: use vmalloc to back perf mmap() buffers"
 -      depends on PERF_EVENTS && DEBUG_KERNEL
 +      depends on PERF_EVENTS && DEBUG_KERNEL && !PPC
        select PERF_USE_VMALLOC
        help
         Use vmalloc memory to back perf mmap() buffers.
diff --combined mm/backing-dev.c
index 000e7b3b9896f2a9479687befd2442c43193614e,436bb53dd383f24380b11a44d6f3e39c42c39fbb..7756da31b02bcbb2a7f7036a4bbbdd093883ad6c
@@@ -18,6 -18,7 +18,7 @@@ struct backing_dev_info noop_backing_de
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
  };
+ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
  
  static struct class *bdi_class;
  
@@@ -48,7 -49,7 +49,7 @@@ static int bdi_debug_stats_show(struct 
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
-       unsigned long bdi_thresh;
+       unsigned long wb_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
        struct inode *inode;
  
@@@ -66,7 -67,7 +67,7 @@@
        spin_unlock(&wb->list_lock);
  
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+       wb_thresh = wb_calc_thresh(wb, dirty_thresh);
  
  #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                  K(bdi_thresh),
+                  (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+                  (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+                  K(wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
-                  (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
-                  (unsigned long) K(bdi->write_bandwidth),
+                  (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+                  (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+                  (unsigned long) K(wb->write_bandwidth),
                   nr_dirty,
                   nr_io,
                   nr_more_io,
                   nr_dirty_time,
-                  !list_empty(&bdi->bdi_list), bdi->state);
+                  !list_empty(&bdi->bdi_list), bdi->wb.state);
  #undef K
  
        return 0;
@@@ -255,13 -256,8 +256,8 @@@ static int __init default_bdi_init(void
  }
  subsys_initcall(default_bdi_init);
  
- int bdi_has_dirty_io(struct backing_dev_info *bdi)
- {
-       return wb_has_dirty_io(&bdi->wb);
- }
  /*
-  * This function is used when the first inode for this bdi is marked dirty. It
+  * This function is used when the first inode for this wb is marked dirty. It
   * wakes up the corresponding bdi thread which should then take care of the
   * periodic background write-out of dirty inodes. Since the write-out would
   * start only 'dirty_writeback_interval' centisecs from now anyway, we just
   * We have to be careful not to postpone flush work if it is scheduled for
   * earlier. Thus we use queue_delayed_work().
   */
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+ void wb_wakeup_delayed(struct bdi_writeback *wb)
  {
        unsigned long timeout;
  
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       spin_lock_bh(&bdi->wb_lock);
-       if (test_bit(BDI_registered, &bdi->state))
-               queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+       spin_unlock_bh(&wb->work_lock);
  }
  
  /*
-  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  * Initial write bandwidth: 100 MB/s
   */
- static void bdi_remove_from_list(struct backing_dev_info *bdi)
- {
-       spin_lock_bh(&bdi_lock);
-       list_del_rcu(&bdi->bdi_list);
-       spin_unlock_bh(&bdi_lock);
-       synchronize_rcu_expedited();
- }
+ #define INIT_BW               (100 << (20 - PAGE_SHIFT))
  
- int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-               const char *fmt, ...)
+ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+                  gfp_t gfp)
  {
-       va_list args;
-       struct device *dev;
+       int i, err;
  
-       if (bdi->dev)   /* The driver needs to use separate queues per device */
-               return 0;
+       memset(wb, 0, sizeof(*wb));
  
-       va_start(args, fmt);
-       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-       va_end(args);
-       if (IS_ERR(dev))
-               return PTR_ERR(dev);
+       wb->bdi = bdi;
+       wb->last_old_flush = jiffies;
+       INIT_LIST_HEAD(&wb->b_dirty);
+       INIT_LIST_HEAD(&wb->b_io);
+       INIT_LIST_HEAD(&wb->b_more_io);
+       INIT_LIST_HEAD(&wb->b_dirty_time);
+       spin_lock_init(&wb->list_lock);
  
-       bdi->dev = dev;
+       wb->bw_time_stamp = jiffies;
+       wb->balanced_dirty_ratelimit = INIT_BW;
+       wb->dirty_ratelimit = INIT_BW;
+       wb->write_bandwidth = INIT_BW;
+       wb->avg_write_bandwidth = INIT_BW;
  
-       bdi_debug_register(bdi, dev_name(dev));
-       set_bit(BDI_registered, &bdi->state);
+       spin_lock_init(&wb->work_lock);
+       INIT_LIST_HEAD(&wb->work_list);
+       INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
  
-       spin_lock_bh(&bdi_lock);
-       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-       spin_unlock_bh(&bdi_lock);
+       err = fprop_local_init_percpu(&wb->completions, gfp);
+       if (err)
+               return err;
  
-       trace_writeback_bdi_register(bdi);
-       return 0;
- }
- EXPORT_SYMBOL(bdi_register);
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+               err = percpu_counter_init(&wb->stat[i], 0, gfp);
+               if (err) {
+                       while (i--)
+                               percpu_counter_destroy(&wb->stat[i]);
+                       fprop_local_destroy_percpu(&wb->completions);
+                       return err;
+               }
+       }
  
- int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
- {
-       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+       return 0;
  }
- EXPORT_SYMBOL(bdi_register_dev);
  
  /*
   * Remove bdi from the global list and shutdown any threads we have running
   */
- static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+ static void wb_shutdown(struct bdi_writeback *wb)
  {
        /* Make sure nobody queues further work */
-       spin_lock_bh(&bdi->wb_lock);
-       if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
-               spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (!test_and_clear_bit(WB_registered, &wb->state)) {
+               spin_unlock_bh(&wb->work_lock);
                return;
        }
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_unlock_bh(&wb->work_lock);
  
        /*
-        * Make sure nobody finds us on the bdi_list anymore
+        * Drain work list and shutdown the delayed_work.  !WB_registered
+        * tells wb_workfn() that @wb is dying and its work_list needs to
+        * be drained no matter what.
         */
-       bdi_remove_from_list(bdi);
+       mod_delayed_work(bdi_wq, &wb->dwork, 0);
+       flush_delayed_work(&wb->dwork);
+       WARN_ON(!list_empty(&wb->work_list));
+ }
+ static void wb_exit(struct bdi_writeback *wb)
+ {
+       int i;
+       WARN_ON(delayed_work_pending(&wb->dwork));
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+               percpu_counter_destroy(&wb->stat[i]);
+       fprop_local_destroy_percpu(&wb->completions);
+ }
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ #include <linux/memcontrol.h>
+ /*
+  * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+  * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
+  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
+  * releases from bdi destruction path.
+  */
+ static DEFINE_SPINLOCK(cgwb_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+ /**
+  * wb_congested_get_create - get or create a wb_congested
+  * @bdi: associated bdi
+  * @blkcg_id: ID of the associated blkcg
+  * @gfp: allocation mask
+  *
+  * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
+  * The returned wb_congested has its reference count incremented.  Returns
+  * NULL on failure.
+  */
+ struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+ {
+       struct bdi_writeback_congested *new_congested = NULL, *congested;
+       struct rb_node **node, *parent;
+       unsigned long flags;
+       if (blkcg_id == 1)
+               return &bdi->wb_congested;
+ retry:
+       spin_lock_irqsave(&cgwb_lock, flags);
+       node = &bdi->cgwb_congested_tree.rb_node;
+       parent = NULL;
+       while (*node != NULL) {
+               parent = *node;
+               congested = container_of(parent, struct bdi_writeback_congested,
+                                        rb_node);
+               if (congested->blkcg_id < blkcg_id)
+                       node = &parent->rb_left;
+               else if (congested->blkcg_id > blkcg_id)
+                       node = &parent->rb_right;
+               else
+                       goto found;
+       }
+       if (new_congested) {
+               /* !found and storage for new one already allocated, insert */
+               congested = new_congested;
+               new_congested = NULL;
+               rb_link_node(&congested->rb_node, parent, node);
+               rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+               atomic_inc(&bdi->usage_cnt);
+               goto found;
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       /* allocate storage for new one and retry */
+       new_congested = kzalloc(sizeof(*new_congested), gfp);
+       if (!new_congested)
+               return NULL;
+       atomic_set(&new_congested->refcnt, 0);
+       new_congested->bdi = bdi;
+       new_congested->blkcg_id = blkcg_id;
+       goto retry;
+ found:
+       atomic_inc(&congested->refcnt);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(new_congested);
+       return congested;
+ }
+ /**
+  * wb_congested_put - put a wb_congested
+  * @congested: wb_congested to put
+  *
+  * Put @congested and destroy it if the refcnt reaches zero.
+  */
+ void wb_congested_put(struct bdi_writeback_congested *congested)
+ {
+       struct backing_dev_info *bdi = congested->bdi;
+       unsigned long flags;
+       if (congested->blkcg_id == 1)
+               return;
+       local_irq_save(flags);
+       if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+               local_irq_restore(flags);
+               return;
+       }
+       rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(congested);
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+ }
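A minimal sketch of the intended get/put pairing (essentially what cgwb_create() below does); the function name is hypothetical:

static int myp_track_congestion(struct backing_dev_info *bdi, int blkcg_id)
{
	struct bdi_writeback_congested *congested;

	congested = wb_congested_get_create(bdi, blkcg_id, GFP_KERNEL);
	if (!congested)
		return -ENOMEM;

	/* ... point a wb (or other user) at @congested here ... */

	wb_congested_put(congested);	/* drop the reference when done */
	return 0;
}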
+ static void cgwb_release_workfn(struct work_struct *work)
+ {
+       struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+                                               release_work);
+       struct backing_dev_info *bdi = wb->bdi;
+       wb_shutdown(wb);
+       css_put(wb->memcg_css);
+       css_put(wb->blkcg_css);
+       wb_congested_put(wb->congested);
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+       percpu_ref_exit(&wb->refcnt);
+       wb_exit(wb);
+       kfree_rcu(wb, rcu);
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+ }
+ static void cgwb_release(struct percpu_ref *refcnt)
+ {
+       struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+                                               refcnt);
+       schedule_work(&wb->release_work);
+ }
+ static void cgwb_kill(struct bdi_writeback *wb)
+ {
+       lockdep_assert_held(&cgwb_lock);
+       WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+       list_del(&wb->memcg_node);
+       list_del(&wb->blkcg_node);
+       percpu_ref_kill(&wb->refcnt);
+ }
+ static int cgwb_create(struct backing_dev_info *bdi,
+                      struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+ {
+       struct mem_cgroup *memcg;
+       struct cgroup_subsys_state *blkcg_css;
+       struct blkcg *blkcg;
+       struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+       struct bdi_writeback *wb;
+       unsigned long flags;
+       int ret = 0;
+       memcg = mem_cgroup_from_css(memcg_css);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg = css_to_blkcg(blkcg_css);
+       memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       blkcg_cgwb_list = &blkcg->cgwb_list;
+       /* look up again under lock and discard on blkcg mismatch */
+       spin_lock_irqsave(&cgwb_lock, flags);
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+       if (wb && wb->blkcg_css != blkcg_css) {
+               cgwb_kill(wb);
+               wb = NULL;
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (wb)
+               goto out_put;
+       /* need to create a new one */
+       wb = kmalloc(sizeof(*wb), gfp);
+       if (!wb)
+               return -ENOMEM;
+       ret = wb_init(wb, bdi, gfp);
+       if (ret)
+               goto err_free;
+       ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+       if (ret)
+               goto err_wb_exit;
+       ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+       if (ret)
+               goto err_ref_exit;
+       wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
+       if (!wb->congested) {
+               ret = -ENOMEM;
+               goto err_fprop_exit;
+       }
+       wb->memcg_css = memcg_css;
+       wb->blkcg_css = blkcg_css;
+       INIT_WORK(&wb->release_work, cgwb_release_workfn);
+       set_bit(WB_registered, &wb->state);
  
        /*
-        * Drain work list and shutdown the delayed_work.  At this point,
-        * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
-        * is dying and its work_list needs to be drained no matter what.
+        * The root wb determines the registered state of the whole bdi and
+        * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+        * whether they're still online.  Don't link @wb if any is dead.
+        * See wb_memcg_offline() and wb_blkcg_offline().
         */
-       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-       flush_delayed_work(&bdi->wb.dwork);
+       ret = -ENODEV;
+       spin_lock_irqsave(&cgwb_lock, flags);
+       if (test_bit(WB_registered, &bdi->wb.state) &&
+           blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+               /* we might have raced another instance of this function */
+               ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+               if (!ret) {
+                       atomic_inc(&bdi->usage_cnt);
+                       list_add(&wb->memcg_node, memcg_cgwb_list);
+                       list_add(&wb->blkcg_node, blkcg_cgwb_list);
+                       css_get(memcg_css);
+                       css_get(blkcg_css);
+               }
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (ret) {
+               if (ret == -EEXIST)
+                       ret = 0;
+               goto err_put_congested;
+       }
+       goto out_put;
+ err_put_congested:
+       wb_congested_put(wb->congested);
+ err_fprop_exit:
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+ err_ref_exit:
+       percpu_ref_exit(&wb->refcnt);
+ err_wb_exit:
+       wb_exit(wb);
+ err_free:
+       kfree(wb);
+ out_put:
+       css_put(blkcg_css);
+       return ret;
  }
  
- static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+ /**
+  * wb_get_create - get wb for a given memcg, create if necessary
+  * @bdi: target bdi
+  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+  * @gfp: allocation mask to use
+  *
+  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
+  * create one.  The returned wb has its refcount incremented.
+  *
+  * This function uses css_get() on @memcg_css and thus expects its refcnt
+  * to be positive on invocation.  IOW, rcu_read_lock() protection on
+  * @memcg_css isn't enough.  try_get it before calling this function.
+  *
+  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
+  * memcg on the default hierarchy, memcg association is guaranteed to be
+  * more specific (equal or descendant to the associated blkcg) and thus can
+  * identify both the memcg and blkcg associations.
+  *
+  * Because the blkcg associated with a memcg may change as blkcg is enabled
+  * and disabled closer to root in the hierarchy, each wb keeps track of
+  * both the memcg and blkcg associated with it and verifies the blkcg on
+  * each lookup.  On mismatch, the existing wb is discarded and a new one is
+  * created.
+  */
+ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp)
  {
-       memset(wb, 0, sizeof(*wb));
+       struct bdi_writeback *wb;
+       might_sleep_if(gfp & __GFP_WAIT);
+       if (!memcg_css->parent)
+               return &bdi->wb;
+       do {
+               rcu_read_lock();
+               wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+               if (wb) {
+                       struct cgroup_subsys_state *blkcg_css;
+                       /* see whether the blkcg association has changed */
+                       blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+                                                    &blkio_cgrp_subsys);
+                       if (unlikely(wb->blkcg_css != blkcg_css ||
+                                    !wb_tryget(wb)))
+                               wb = NULL;
+                       css_put(blkcg_css);
+               }
+               rcu_read_unlock();
+       } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+       return wb;
+ }
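
   The lookup/verify/create loop in wb_get_create() and cgwb_create() above follows a
   common shape: look up optimistically, allocate outside the lock, publish under the
   lock, and treat losing the insertion race as success by going back to the lookup.
   A minimal standalone model of just that shape (plain userspace C with invented
   names; not kernel code and not part of this series):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int *slot;                       /* stands in for the radix tree entry */

    static int *get_or_create(int value)
    {
            int *obj;

            for (;;) {
                    pthread_mutex_lock(&lock);
                    obj = slot;             /* the radix_tree_lookup() step */
                    pthread_mutex_unlock(&lock);
                    if (obj)
                            return obj;

                    obj = malloc(sizeof(*obj));     /* create outside the lock */
                    if (!obj)
                            return NULL;
                    *obj = value;

                    pthread_mutex_lock(&lock);
                    if (!slot) {
                            slot = obj;     /* won the race: publish */
                            pthread_mutex_unlock(&lock);
                            return obj;
                    }
                    pthread_mutex_unlock(&lock);
                    free(obj);              /* lost the race (-EEXIST): retry */
            }
    }

    int main(void)
    {
            printf("%d\n", *get_or_create(42));
            return 0;
    }
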
  
-       wb->bdi = bdi;
-       wb->last_old_flush = jiffies;
-       INIT_LIST_HEAD(&wb->b_dirty);
-       INIT_LIST_HEAD(&wb->b_io);
-       INIT_LIST_HEAD(&wb->b_more_io);
-       INIT_LIST_HEAD(&wb->b_dirty_time);
-       spin_lock_init(&wb->list_lock);
-       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+ static void cgwb_bdi_init(struct backing_dev_info *bdi)
+ {
+       bdi->wb.memcg_css = mem_cgroup_root_css;
+       bdi->wb.blkcg_css = blkcg_root_css;
+       bdi->wb_congested.blkcg_id = 1;
+       INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+       bdi->cgwb_congested_tree = RB_ROOT;
+       atomic_set(&bdi->usage_cnt, 1);
  }
  
- /*
-  * Initial write bandwidth: 100 MB/s
+ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+ {
+       struct radix_tree_iter iter;
+       void **slot;
+       WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+       spin_lock_irq(&cgwb_lock);
+       radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+               cgwb_kill(*slot);
+       spin_unlock_irq(&cgwb_lock);
+       /*
+        * All cgwb's and their congested states must be shutdown and
+        * released before returning.  Drain the usage counter to wait for
+        * all cgwb's and cgwb_congested's ever created on @bdi.
+        */
+       atomic_dec(&bdi->usage_cnt);
+       wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+ }
+ /**
+  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+  * @memcg: memcg being offlined
+  *
+  * Also prevents creation of any new wb's associated with @memcg.
   */
- #define INIT_BW               (100 << (20 - PAGE_SHIFT))
+ void wb_memcg_offline(struct mem_cgroup *memcg)
+ {
+       LIST_HEAD(to_destroy);
+       struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       struct bdi_writeback *wb, *next;
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+               cgwb_kill(wb);
+       memcg_cgwb_list->next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+ }
+ /**
+  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+  * @blkcg: blkcg being offlined
+  *
+  * Also prevents creation of any new wb's associated with @blkcg.
+  */
+ void wb_blkcg_offline(struct blkcg *blkcg)
+ {
+       LIST_HEAD(to_destroy);
+       struct bdi_writeback *wb, *next;
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+               cgwb_kill(wb);
+       blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+ }
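
   wb_memcg_offline() and wb_blkcg_offline() block future wb creation by poisoning the
   per-cgroup list head (->next = NULL) under cgwb_lock, which cgwb_create() re-checks
   under the same lock before linking a new wb. A tiny standalone model of that
   handshake (userspace C with invented names, not kernel code; a boolean stands in
   for the poisoned ->next pointer):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static bool list_dead;
    static int nr_members;

    static int add_member(void)             /* the cgwb_create() side */
    {
            int ret = -1;

            pthread_mutex_lock(&lock);
            if (!list_dead) {               /* checked under the same lock */
                    nr_members++;
                    ret = 0;
            }
            pthread_mutex_unlock(&lock);
            return ret;
    }

    static void offline(void)               /* the wb_*_offline() side */
    {
            pthread_mutex_lock(&lock);
            /* existing members would be killed here ... */
            list_dead = true;               /* prevent new ones */
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            add_member();
            offline();
            printf("late add %s\n", add_member() ? "rejected" : "accepted");
            return 0;
    }
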
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  
  int bdi_init(struct backing_dev_info *bdi)
  {
-       int i, err;
+       int err;
  
        bdi->dev = NULL;
  
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
-       spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
-       INIT_LIST_HEAD(&bdi->work_list);
+       init_waitqueue_head(&bdi->wb_waitq);
  
-       bdi_wb_init(&bdi->wb, bdi);
+       err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+       if (err)
+               return err;
  
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-               err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-               if (err)
-                       goto err;
-       }
+       bdi->wb_congested.state = 0;
+       bdi->wb.congested = &bdi->wb_congested;
  
-       bdi->dirty_exceeded = 0;
+       cgwb_bdi_init(bdi);
+       return 0;
+ }
+ EXPORT_SYMBOL(bdi_init);
  
-       bdi->bw_time_stamp = jiffies;
-       bdi->written_stamp = 0;
+ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+               const char *fmt, ...)
+ {
+       va_list args;
+       struct device *dev;
  
-       bdi->balanced_dirty_ratelimit = INIT_BW;
-       bdi->dirty_ratelimit = INIT_BW;
-       bdi->write_bandwidth = INIT_BW;
-       bdi->avg_write_bandwidth = INIT_BW;
+       if (bdi->dev)   /* The driver needs to use separate queues per device */
+               return 0;
  
-       err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+       va_start(args, fmt);
+       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+       va_end(args);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
  
-       if (err) {
- err:
-               while (i--)
-                       percpu_counter_destroy(&bdi->bdi_stat[i]);
-       }
+       bdi->dev = dev;
  
-       return err;
+       bdi_debug_register(bdi, dev_name(dev));
+       set_bit(WB_registered, &bdi->wb.state);
+       spin_lock_bh(&bdi_lock);
+       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+       spin_unlock_bh(&bdi_lock);
+       trace_writeback_bdi_register(bdi);
+       return 0;
  }
- EXPORT_SYMBOL(bdi_init);
+ EXPORT_SYMBOL(bdi_register);
  
- void bdi_destroy(struct backing_dev_info *bdi)
+ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
  {
-       int i;
+       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ }
+ EXPORT_SYMBOL(bdi_register_dev);
+ /*
+  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  */
+ static void bdi_remove_from_list(struct backing_dev_info *bdi)
+ {
+       spin_lock_bh(&bdi_lock);
+       list_del_rcu(&bdi->bdi_list);
+       spin_unlock_bh(&bdi_lock);
  
-       bdi_wb_shutdown(bdi);
-       bdi_set_min_ratio(bdi, 0);
+       synchronize_rcu_expedited();
+ }
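
   For context, the list_del_rcu()/synchronize_rcu_expedited() pairing above only has
   a job to do because other code walks bdi_list under rcu_read_lock(); once the grace
   period elapses, no such walker can still be dereferencing the unlinked bdi. A
   sketch of the reader side this ordering protects (illustrative only, not copied
   from any particular call site; bdi_list is the file-local list just unlinked from):

    struct backing_dev_info *bdi;

    rcu_read_lock();
    list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
            /* @bdi cannot be freed while this section runs */
    }
    rcu_read_unlock();
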
  
-       WARN_ON(!list_empty(&bdi->work_list));
-       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 -/*
 - * Called when the device behind @bdi has been removed or ejected.
 - *
 - * We can't really do much here except for reducing the dirty ratio at
 - * the moment.  In the future we should be able to set a flag so that
 - * the filesystem can handle errors at mark_inode_dirty time instead
 - * of only at writeback time.
 - */
 -void bdi_unregister(struct backing_dev_info *bdi)
 -{
 -      if (WARN_ON_ONCE(!bdi->dev))
 -              return;
 -
 -      bdi_set_min_ratio(bdi, 0);
 -}
 -EXPORT_SYMBOL(bdi_unregister);
 -
+ void bdi_destroy(struct backing_dev_info *bdi)
+ {
+       /* make sure nobody finds us on the bdi_list anymore */
+       bdi_remove_from_list(bdi);
+       wb_shutdown(&bdi->wb);
+       cgwb_bdi_destroy(bdi);
  
        if (bdi->dev) {
                bdi_debug_unregister(bdi);
                bdi->dev = NULL;
        }
  
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
-               percpu_counter_destroy(&bdi->bdi_stat[i]);
-       fprop_local_destroy_percpu(&bdi->completions);
+       wb_exit(&bdi->wb);
  }
  EXPORT_SYMBOL(bdi_destroy);
  
@@@ -472,31 -871,31 +854,31 @@@ static wait_queue_head_t congestion_wqh
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
- static atomic_t nr_bdi_congested[2];
+ static atomic_t nr_wb_congested[2];
  
- void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+ void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
  {
-       enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];
+       enum wb_state bit;
  
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (test_and_clear_bit(bit, &bdi->state))
-               atomic_dec(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (test_and_clear_bit(bit, &congested->state))
+               atomic_dec(&nr_wb_congested[sync]);
        smp_mb__after_atomic();
        if (waitqueue_active(wqh))
                wake_up(wqh);
  }
- EXPORT_SYMBOL(clear_bdi_congested);
+ EXPORT_SYMBOL(clear_wb_congested);
  
- void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+ void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
  {
-       enum bdi_state bit;
+       enum wb_state bit;
  
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (!test_and_set_bit(bit, &bdi->state))
-               atomic_inc(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (!test_and_set_bit(bit, &congested->state))
+               atomic_inc(&nr_wb_congested[sync]);
  }
- EXPORT_SYMBOL(set_bdi_congested);
+ EXPORT_SYMBOL(set_wb_congested);
  
  /**
   * congestion_wait - wait for a backing_dev to become uncongested
@@@ -555,7 -954,7 +937,7 @@@ long wait_iff_congested(struct zone *zo
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
-       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
            !test_bit(ZONE_CONGESTED, &zone->flags)) {
                cond_resched();
  
diff --combined mm/filemap.c
index 8d17ceea8dbeb1f641687407f2a27ebbff480533,bfc1ab053b1224acd96a1851e265f6872d744881..11f10efd637c2d67e071c482951e2bb38a105d6b
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
+  *    ->memcg->move_lock      (page_remove_rmap->mem_cgroup_begin_page_stat)
   *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
@@@ -174,9 -175,11 +175,11 @@@ static void page_cache_tree_delete(stru
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
-  * is safe.  The caller must hold the mapping's tree_lock.
+  * is safe.  The caller must hold the mapping's tree_lock and
+  * mem_cgroup_begin_page_stat().
   */
- void __delete_from_page_cache(struct page *page, void *shadow)
+ void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
  {
        struct address_space *mapping = page->mapping;
  
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
  
 -      __dec_zone_page_state(page, NR_FILE_PAGES);
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!PageHuge(page))
 +              __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
  }
  
  /**
  void delete_from_page_cache(struct page *page)
  {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
        void (*freepage)(struct page *);
  
        BUG_ON(!PageLocked(page));
  
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
  
        if (freepage)
                freepage(page);
@@@ -283,7 -291,9 +293,9 @@@ int __filemap_fdatawrite_range(struct a
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
  
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
  }
  
@@@ -472,6 -482,8 +484,8 @@@ int replace_page_cache_page(struct pag
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
  
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
                new->mapping = mapping;
                new->index = offset;
  
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
 -              __inc_zone_page_state(new, NR_FILE_PAGES);
 +
 +              /*
 +               * hugetlb pages do not participate in page cache accounting.
 +               */
 +              if (!PageHuge(new))
 +                      __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
@@@ -582,10 -591,7 +598,10 @@@ static int __add_to_page_cache_locked(s
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
 -      __inc_zone_page_state(page, NR_FILE_PAGES);
 +
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!huge)
 +              __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@@ -1664,8 -1670,8 +1680,8 @@@ no_cached_page
                        error = -ENOMEM;
                        goto out;
                }
 -              error = add_to_page_cache_lru(page, mapping,
 -                                              index, GFP_KERNEL);
 +              error = add_to_page_cache_lru(page, mapping, index,
 +                                      GFP_KERNEL & mapping_gfp_mask(mapping));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@@ -1766,8 -1772,7 +1782,8 @@@ static int page_cache_read(struct file 
                if (!page)
                        return -ENOMEM;
  
 -              ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
 +              ret = add_to_page_cache_lru(page, mapping, offset,
 +                              GFP_KERNEL & mapping_gfp_mask(mapping));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
diff --combined mm/memcontrol.c
index e65f7b0131d3598cb5ba0ce3497d47b43d676dea,f816d91c643b7ee59af809b72f846ddddced8924..acb93c554f6e8456dc9312734162317d1adea54d
@@@ -77,6 -77,7 +77,7 @@@ EXPORT_SYMBOL(memory_cgrp_subsys)
  
  #define MEM_CGROUP_RECLAIM_RETRIES    5
  static struct mem_cgroup *root_mem_cgroup __read_mostly;
+ struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
  
  /* Whether the swap controller is active */
  #ifdef CONFIG_MEMCG_SWAP
@@@ -90,6 -91,7 +91,7 @@@ static const char * const mem_cgroup_st
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
  };
@@@ -285,9 -287,9 +287,9 @@@ struct mem_cgroup 
         */
        bool use_hierarchy;
  
 +      /* protected by memcg_oom_lock */
        bool            oom_lock;
 -      atomic_t        under_oom;
 -      atomic_t        oom_wakeups;
 +      int             under_oom;
  
        int     swappiness;
        /* OOM-Killer disable */
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        atomic_t        numainfo_updating;
  #endif
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+ #endif
        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
@@@ -596,6 -598,39 +598,39 @@@ struct cgroup_subsys_state *mem_cgroup_
        return &memcg->css;
  }
  
+ /**
+  * mem_cgroup_css_from_page - css of the memcg associated with a page
+  * @page: page of interest
+  *
+  * If memcg is bound to the default hierarchy, css of the memcg associated
+  * with @page is returned.  The returned css remains associated with @page
+  * until it is released.
+  *
+  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+  * is returned.
+  *
+  * XXX: The above description of behavior on the default hierarchy isn't
+  * strictly true yet as replace_page_cache_page() can modify the
+  * association before @page is released even on the default hierarchy;
+  * however, the current and planned usages don't mix the two functions
+  * and replace_page_cache_page() will soon be updated to make the invariant
+  * actually true.
+  */
+ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+ {
+       struct mem_cgroup *memcg;
+       rcu_read_lock();
+       memcg = page->mem_cgroup;
+       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+               memcg = root_mem_cgroup;
+       rcu_read_unlock();
+       return &memcg->css;
+ }
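
   A sketch of how a writeback-side caller might consume this helper (illustrative
   only; 'page' and 'inode' are assumed locals and GFP_ATOMIC is just one plausible
   choice): resolve the css of the page being written and key the wb lookup off it
   with wb_get_create() from mm/backing-dev.c earlier in this merge.

    struct cgroup_subsys_state *memcg_css = mem_cgroup_css_from_page(page);
    struct bdi_writeback *wb;

    wb = wb_get_create(inode_to_bdi(inode), memcg_css, GFP_ATOMIC);
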
  static struct mem_cgroup_per_zone *
  mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
  {
@@@ -795,15 -830,8 +830,8 @@@ static long mem_cgroup_read_stat(struc
        long val = 0;
        int cpu;
  
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
- #ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
- #endif
-       put_online_cpus();
        return val;
  }
  
@@@ -813,15 -841,8 +841,8 @@@ static unsigned long mem_cgroup_read_ev
        unsigned long val = 0;
        int cpu;
  
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
- #ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
- #endif
-       put_online_cpus();
        return val;
  }
  
@@@ -1530,16 -1551,14 +1551,16 @@@ static void mem_cgroup_out_of_memory(st
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
  
 +      mutex_lock(&oom_lock);
 +
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 -              mark_tsk_oom_victim(current);
 -              return;
 +              mark_oom_victim(current);
 +              goto unlock;
        }
  
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
 -                              return;
 +                              goto unlock;
                        case OOM_SCAN_OK:
                                break;
                        };
                css_task_iter_end(&it);
        }
  
 -      if (!chosen)
 -              return;
 -      points = chosen_points * 1000 / totalpages;
 -      oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 -                       NULL, "Memory cgroup out of memory");
 +      if (chosen) {
 +              points = chosen_points * 1000 / totalpages;
 +              oom_kill_process(chosen, gfp_mask, order, points, totalpages,
 +                               memcg, NULL, "Memory cgroup out of memory");
 +      }
 +unlock:
 +      mutex_unlock(&oom_lock);
  }
  
  #if MAX_NUMNODES > 1
@@@ -1810,10 -1827,8 +1831,10 @@@ static void mem_cgroup_mark_under_oom(s
  {
        struct mem_cgroup *iter;
  
 +      spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
 -              atomic_inc(&iter->under_oom);
 +              iter->under_oom++;
 +      spin_unlock(&memcg_oom_lock);
  }
  
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
  
        /*
         * When a new child is created while the hierarchy is under oom,
 -       * mem_cgroup_oom_lock() may not be called. We have to use
 -       * atomic_add_unless() here.
 +       * mem_cgroup_oom_lock() may not be called. Watch for underflow.
         */
 +      spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
 -              atomic_add_unless(&iter->under_oom, -1, 0);
 +              if (iter->under_oom > 0)
 +                      iter->under_oom--;
 +      spin_unlock(&memcg_oom_lock);
  }
  
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@@ -1854,18 -1867,17 +1875,18 @@@ static int memcg_oom_wake_function(wait
        return autoremove_wake_function(wait, mode, sync, arg);
  }
  
 -static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 -{
 -      atomic_inc(&memcg->oom_wakeups);
 -      /* for filtering, pass "memcg" as argument. */
 -      __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 -}
 -
  static void memcg_oom_recover(struct mem_cgroup *memcg)
  {
 -      if (memcg && atomic_read(&memcg->under_oom))
 -              memcg_wakeup_oom(memcg);
 +      /*
 +       * For the following lockless ->under_oom test, the only required
 +       * guarantee is that it must see the state asserted by an OOM when
 +       * this function is called as a result of userland actions
 +       * triggered by the notification of the OOM.  This is trivially
 +       * achieved by invoking mem_cgroup_mark_under_oom() before
 +       * triggering notification.
 +       */
 +      if (memcg && memcg->under_oom)
 +              __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
  
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@@ -2020,6 -2032,7 +2041,7 @@@ again
  
        return memcg;
  }
+ EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
  
  /**
   * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@@ -2038,6 -2051,7 +2060,7 @@@ void mem_cgroup_end_page_stat(struct me
  
        rcu_read_unlock();
  }
+ EXPORT_SYMBOL(mem_cgroup_end_page_stat);
  
  /**
   * mem_cgroup_update_page_stat - update page state statistics
@@@ -2178,37 -2192,12 +2201,12 @@@ static void drain_all_stock(struct mem_
        mutex_unlock(&percpu_charge_mutex);
  }
  
- /*
-  * This function drains percpu counter value from DEAD cpu and
-  * move it to local cpu. Note that this function can be preempted.
-  */
- static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
- {
-       int i;
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
- }
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
  {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
  
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
  
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@@ -2332,8 -2318,6 +2327,8 @@@ done_restock
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
 +      if (!(gfp_mask & __GFP_WAIT))
 +              goto done;
        /*
         * If the hierarchy is above the normal consumption range,
         * make the charging task trim their excess contribution.
@@@ -3873,7 -3857,7 +3868,7 @@@ static int mem_cgroup_oom_register_even
        list_add(&event->list, &memcg->oom_notify);
  
        /* already in OOM ? */
 -      if (atomic_read(&memcg->under_oom))
 +      if (memcg->under_oom)
                eventfd_signal(eventfd, 1);
        spin_unlock(&memcg_oom_lock);
  
@@@ -3902,7 -3886,7 +3897,7 @@@ static int mem_cgroup_oom_control_read(
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
  
        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
 -      seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
 +      seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        return 0;
  }
  
@@@ -4004,6 -3988,98 +3999,98 @@@ static void memcg_destroy_kmem(struct m
  }
  #endif
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+ {
+       return &memcg->cgwb_list;
+ }
+ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+ {
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+ }
+ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+ {
+       wb_domain_exit(&memcg->cgwb_domain);
+ }
+ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+ {
+       wb_domain_size_changed(&memcg->cgwb_domain);
+ }
+ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       if (!memcg->css.parent)
+               return NULL;
+       return &memcg->cgwb_domain;
+ }
+ /**
+  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+  * @wb: bdi_writeback in question
+  * @pavail: out parameter for number of available pages
+  * @pdirty: out parameter for number of dirty pages
+  * @pwriteback: out parameter for number of pages under writeback
+  *
+  * Determine the numbers of available, dirty, and writeback pages in @wb's
+  * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+  * more involved.
+  *
+  * A memcg's headroom is "min(max, high) - used".  The available memory is
+  * calculated as the lowest headroom of itself and the ancestors plus the
+  * number of pages already being used for file pages.  Note that this
+  * doesn't consider the actual amount of available memory in the system.
+  * The caller should further cap *@pavail accordingly.
+  */
+ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+       unsigned long head_room = PAGE_COUNTER_MAX;
+       unsigned long file_pages;
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                   (1 << LRU_ACTIVE_FILE));
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+               head_room = min(head_room, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+       *pavail = file_pages + head_room;
+ }
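
   The headroom walk above is easier to see with numbers. A self-contained userspace
   toy (not kernel code; the limits, usage and file-page counts are invented) that
   mirrors the arithmetic:

    #include <stdio.h>

    struct cg { unsigned long limit, high, used; };

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            /* leaf first, then its ancestors below the root */
            struct cg path[] = {
                    { .limit = 131072, .high = 65536,  .used = 40000 },  /* leaf   */
                    { .limit = 262144, .high = 262144, .used = 250000 }, /* parent */
            };
            unsigned long file_pages = 12000;   /* LRU file pages of the leaf */
            unsigned long head_room = ~0UL;     /* stands in for PAGE_COUNTER_MAX */
            unsigned int i;

            for (i = 0; i < sizeof(path) / sizeof(path[0]); i++) {
                    unsigned long ceiling = MIN(path[i].limit, path[i].high);
                    unsigned long used = path[i].used;

                    head_room = MIN(head_room, ceiling - MIN(ceiling, used));
            }
            /* leaf headroom 65536-40000=25536, parent 262144-250000=12144 -> 12144 */
            printf("avail = %lu pages\n", file_pages + head_room);  /* 24144 */
            return 0;
    }
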
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+ {
+       return 0;
+ }
+ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+ {
+ }
+ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+ {
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  /*
   * DO NOT USE IN NEW FILES.
   *
@@@ -4388,9 -4464,15 +4475,15 @@@ static struct mem_cgroup *mem_cgroup_al
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
  
+ out_free_stat:
+       free_percpu(memcg->stat);
  out_free:
        kfree(memcg);
        return NULL;
@@@ -4417,6 -4499,7 +4510,7 @@@ static void __mem_cgroup_free(struct me
                free_mem_cgroup_per_zone_info(memcg, node);
  
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
  }
  
@@@ -4449,6 -4532,7 +4543,7 @@@ mem_cgroup_css_alloc(struct cgroup_subs
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
  #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
  #endif
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+ #endif
        return &memcg->css;
  
  free_out:
@@@ -4555,6 -4641,8 +4652,8 @@@ static void mem_cgroup_css_offline(stru
        vmpressure_cleanup(&memcg->vmpressure);
  
        memcg_deactivate_kmem(memcg);
+       wb_memcg_offline(memcg);
  }
  
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@@ -4588,6 -4676,7 +4687,7 @@@ static void mem_cgroup_css_reset(struc
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       memcg_wb_domain_size_changed(memcg);
  }
  
  #ifdef CONFIG_MMU
@@@ -4757,6 -4846,7 +4857,7 @@@ static int mem_cgroup_move_account(stru
  {
        unsigned long flags;
        int ret;
+       bool anon;
  
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
        if (page->mem_cgroup != from)
                goto out_unlock;
  
+       anon = PageAnon(page);
        spin_lock_irqsave(&from->move_lock, flags);
  
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
  
+       /*
+        * move_lock grabbed above and caller set from->moving_account, so
+        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * So mapping should be stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
@@@ -5306,6 -5414,7 +5425,7 @@@ static ssize_t memory_high_write(struc
  
        memcg->high = high;
  
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
  }
  
@@@ -5338,6 -5447,7 +5458,7 @@@ static ssize_t memory_max_write(struct 
        if (err)
                return err;
  
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
  }
  
@@@ -5844,7 -5954,9 +5965,7 @@@ void mem_cgroup_swapout(struct page *pa
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, 1);
  
 -      /* XXX: caller holds IRQ-safe mapping->tree_lock */
 -      VM_BUG_ON(!irqs_disabled());
 -
 +      /* Caller disabled preemption with mapping->tree_lock */
        mem_cgroup_charge_statistics(memcg, page, -1);
        memcg_check_events(memcg, page);
  }
diff --combined mm/page-writeback.c
index eb59f7eea50827fc09e1c4f7a432b59ff2241d17,e1514d5b4e9bf62b5ae3552fb6f35900776985d9..22cddd3e5de8433952e99438d3260ae9ff20bd8d
@@@ -122,31 -122,31 +122,31 @@@ EXPORT_SYMBOL(laptop_mode)
  
  /* End of sysctl-exported parameters */
  
- unsigned long global_dirty_limit;
+ struct wb_domain global_wb_domain;
  
- /*
-  * Scale the writeback cache size proportional to the relative writeout speeds.
-  *
-  * We do this by keeping a floating proportion between BDIs, based on page
-  * writeback completions [end_page_writeback()]. Those devices that write out
-  * pages fastest will get the larger share, while the slower will get a smaller
-  * share.
-  *
-  * We use page writeout completions because we are interested in getting rid of
-  * dirty pages. Having them written out is the primary goal.
-  *
-  * We introduce a concept of time, a period over which we measure these events,
-  * because demand can/will vary over time. The length of this period itself is
-  * measured in page writeback completions.
-  *
-  */
- static struct fprop_global writeout_completions;
+ /* consolidated parameters for balance_dirty_pages() and its subroutines */
+ struct dirty_throttle_control {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct wb_domain        *dom;
+       struct dirty_throttle_control *gdtc;    /* only set in memcg dtc's */
+ #endif
+       struct bdi_writeback    *wb;
+       struct fprop_local_percpu *wb_completions;
  
- static void writeout_period(unsigned long t);
- /* Timer for aging of writeout_completions */
- static struct timer_list writeout_period_timer =
-               TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
- static unsigned long writeout_period_time = 0;
+       unsigned long           avail;          /* dirtyable */
+       unsigned long           dirty;          /* file_dirty + write + nfs */
+       unsigned long           thresh;         /* dirty threshold */
+       unsigned long           bg_thresh;      /* dirty background threshold */
+       unsigned long           wb_dirty;       /* per-wb counterparts */
+       unsigned long           wb_thresh;
+       unsigned long           wb_bg_thresh;
+       unsigned long           pos_ratio;
+ };
+ #define DTC_INIT_COMMON(__wb) .wb = (__wb),                           \
+                               .wb_completions = &(__wb)->completions
  
  /*
   * Length of period for aging writeout fractions of bdis. This is an
   */
  #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ #define GDTC_INIT(__wb)               .dom = &global_wb_domain,               \
+                               DTC_INIT_COMMON(__wb)
+ #define GDTC_INIT_NO_WB               .dom = &global_wb_domain
+ #define MDTC_INIT(__wb, __gdtc)       .dom = mem_cgroup_wb_domain(__wb),      \
+                               .gdtc = __gdtc,                         \
+                               DTC_INIT_COMMON(__wb)
+ static bool mdtc_valid(struct dirty_throttle_control *dtc)
+ {
+       return dtc->dom;
+ }
+ static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+ {
+       return dtc->dom;
+ }
+ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+ {
+       return mdtc->gdtc;
+ }
+ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+ {
+       return &wb->memcg_completions;
+ }
+ static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+ {
+       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+       unsigned long long min = wb->bdi->min_ratio;
+       unsigned long long max = wb->bdi->max_ratio;
+       /*
+        * @wb may already be clean by the time control reaches here and
+        * the total may not include its bw.
+        */
+       if (this_bw < tot_bw) {
+               if (min) {
+                       min *= this_bw;
+                       do_div(min, tot_bw);
+               }
+               if (max < 100) {
+                       max *= this_bw;
+                       do_div(max, tot_bw);
+               }
+       }
+       *minp = min;
+       *maxp = max;
+ }
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ #define GDTC_INIT(__wb)               DTC_INIT_COMMON(__wb)
+ #define GDTC_INIT_NO_WB
+ #define MDTC_INIT(__wb, __gdtc)
+ static bool mdtc_valid(struct dirty_throttle_control *dtc)
+ {
+       return false;
+ }
+ static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+ {
+       return &global_wb_domain;
+ }
+ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+ {
+       return NULL;
+ }
+ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+ {
+       return NULL;
+ }
+ static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+ {
+       *minp = wb->bdi->min_ratio;
+       *maxp = wb->bdi->max_ratio;
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
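
   For orientation, GDTC_INIT/MDTC_INIT above are designed to be composed on the
   stack by the throttling paths, with the memcg dtc pointing back at the global one
   through ->gdtc. Roughly like the following kernel-style sketch (the *_stor names
   are illustrative and 'wb' is assumed to be the bdi_writeback being throttled):

    struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
    struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
    struct dirty_throttle_control *gdtc = &gdtc_stor;
    struct dirty_throttle_control *mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL;
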
  /*
   * In a memory zone, there is a certain amount of pages we consider
   * available for the page cache, which is essentially the number of
@@@ -250,42 -341,88 +341,88 @@@ static unsigned long global_dirtyable_m
        return x + 1;   /* Ensure that we never return 0 */
  }
  
- /*
-  * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ /**
+  * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+  * @dtc: dirty_throttle_control of interest
   *
-  * Calculate the dirty thresholds based on sysctl parameters
-  * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
-  * - vm.dirty_ratio             or  vm.dirty_bytes
-  * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+  * Calculate @dtc->thresh and ->bg_thresh considering
+  * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
+  * must ensure that @dtc->avail is set before calling this function.  The
+  * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
   * real-time tasks.
   */
- void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
  {
-       const unsigned long available_memory = global_dirtyable_memory();
-       unsigned long background;
-       unsigned long dirty;
+       const unsigned long available_memory = dtc->avail;
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+       unsigned long bytes = vm_dirty_bytes;
+       unsigned long bg_bytes = dirty_background_bytes;
+       unsigned long ratio = vm_dirty_ratio;
+       unsigned long bg_ratio = dirty_background_ratio;
+       unsigned long thresh;
+       unsigned long bg_thresh;
        struct task_struct *tsk;
  
-       if (vm_dirty_bytes)
-               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       /* gdtc is !NULL iff @dtc is for memcg domain */
+       if (gdtc) {
+               unsigned long global_avail = gdtc->avail;
+               /*
+                * The byte settings can't be applied directly to memcg
+                * domains.  Convert them to ratios by scaling against
+                * globally available memory.
+                */
+               if (bytes)
+                       ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+                                   global_avail, 100UL);
+               if (bg_bytes)
+                       bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+                                      global_avail, 100UL);
+               bytes = bg_bytes = 0;
+       }
+       if (bytes)
+               thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
-               dirty = (vm_dirty_ratio * available_memory) / 100;
+               thresh = (ratio * available_memory) / 100;
  
-       if (dirty_background_bytes)
-               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       if (bg_bytes)
+               bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
-               background = (dirty_background_ratio * available_memory) / 100;
+               bg_thresh = (bg_ratio * available_memory) / 100;
  
-       if (background >= dirty)
-               background = dirty / 2;
+       if (bg_thresh >= thresh)
+               bg_thresh = thresh / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-               background += background / 4;
-               dirty += dirty / 4;
+               bg_thresh += bg_thresh / 4;
+               thresh += thresh / 4;
        }
-       *pbackground = background;
-       *pdirty = dirty;
-       trace_global_dirty_state(background, dirty);
+       dtc->thresh = thresh;
+       dtc->bg_thresh = bg_thresh;
+       /* we should eventually report the domain in the TP */
+       if (!gdtc)
+               trace_global_dirty_state(bg_thresh, thresh);
+ }
+ /**
+  * global_dirty_limits - background-writeback and dirty-throttling thresholds
+  * @pbackground: out parameter for bg_thresh
+  * @pdirty: out parameter for thresh
+  *
+  * Calculate bg_thresh and thresh for global_wb_domain.  See
+  * domain_dirty_limits() for details.
+  */
+ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+ {
+       struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+       gdtc.avail = global_dirtyable_memory();
+       domain_dirty_limits(&gdtc);
+       *pbackground = gdtc.bg_thresh;
+       *pdirty = gdtc.thresh;
  }
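
   A worked example of domain_dirty_limits() with round numbers, as a self-contained
   userspace toy (not kernel code; the dirtyable page count and sysctl values are
   invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long available_memory = 1000000;       /* dtc->avail */
            unsigned long ratio = 20, bg_ratio = 10;        /* sysctl percentages */
            unsigned long thresh, bg_thresh;

            thresh = (ratio * available_memory) / 100;          /* 200000 pages */
            bg_thresh = (bg_ratio * available_memory) / 100;    /* 100000 pages */
            if (bg_thresh >= thresh)
                    bg_thresh = thresh / 2;                     /* not taken here */

            /* a PF_LESS_THROTTLE or rt task would get a 25% boost:
             * thresh 250000, bg_thresh 125000 */
            printf("thresh=%lu bg_thresh=%lu\n", thresh, bg_thresh);
            return 0;
    }
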
  
  /**
@@@ -392,47 -529,52 +529,52 @@@ static unsigned long wp_next_time(unsig
        return cur_time;
  }
  
- /*
-  * Increment the BDI's writeout completion count and the global writeout
-  * completion count. Called from test_clear_page_writeback().
-  */
- static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+ static void wb_domain_writeout_inc(struct wb_domain *dom,
+                                  struct fprop_local_percpu *completions,
+                                  unsigned int max_prop_frac)
  {
-       __inc_bdi_stat(bdi, BDI_WRITTEN);
-       __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
-                              bdi->max_prop_frac);
+       __fprop_inc_percpu_max(&dom->completions, completions,
+                              max_prop_frac);
        /* First event after period switching was turned off? */
-       if (!unlikely(writeout_period_time)) {
+       if (!unlikely(dom->period_time)) {
                /*
                 * We can race with other __bdi_writeout_inc calls here but
                 * it does not cause any harm since the resulting time when
                 * timer will fire and what is in writeout_period_time will be
                 * roughly the same.
                 */
-               writeout_period_time = wp_next_time(jiffies);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               dom->period_time = wp_next_time(jiffies);
+               mod_timer(&dom->period_timer, dom->period_time);
        }
  }
  
- void bdi_writeout_inc(struct backing_dev_info *bdi)
+ /*
+  * Increment @wb's writeout completion count and the global writeout
+  * completion count. Called from test_clear_page_writeback().
+  */
+ static inline void __wb_writeout_inc(struct bdi_writeback *wb)
  {
-       unsigned long flags;
+       struct wb_domain *cgdom;
  
-       local_irq_save(flags);
-       __bdi_writeout_inc(bdi);
-       local_irq_restore(flags);
+       __inc_wb_stat(wb, WB_WRITTEN);
+       wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+                              wb->bdi->max_prop_frac);
+       cgdom = mem_cgroup_wb_domain(wb);
+       if (cgdom)
+               wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+                                      wb->bdi->max_prop_frac);
  }
- EXPORT_SYMBOL_GPL(bdi_writeout_inc);
  
- /*
-  * Obtain an accurate fraction of the BDI's portion.
-  */
- static void bdi_writeout_fraction(struct backing_dev_info *bdi,
-               long *numerator, long *denominator)
+ void wb_writeout_inc(struct bdi_writeback *wb)
  {
-       fprop_fraction_percpu(&writeout_completions, &bdi->completions,
-                               numerator, denominator);
+       unsigned long flags;
+       local_irq_save(flags);
+       __wb_writeout_inc(wb);
+       local_irq_restore(flags);
  }
+ EXPORT_SYMBOL_GPL(wb_writeout_inc);
  
  /*
   * On idle system, we can be called long after we scheduled because we use
   */
  static void writeout_period(unsigned long t)
  {
-       int miss_periods = (jiffies - writeout_period_time) /
+       struct wb_domain *dom = (void *)t;
+       int miss_periods = (jiffies - dom->period_time) /
                                                 VM_COMPLETIONS_PERIOD_LEN;
  
-       if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-               writeout_period_time = wp_next_time(writeout_period_time +
+       if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+               dom->period_time = wp_next_time(dom->period_time +
                                miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               mod_timer(&dom->period_timer, dom->period_time);
        } else {
                /*
                 * Aging has zeroed all fractions. Stop wasting CPU on period
                 * updates.
                 */
-               writeout_period_time = 0;
+               dom->period_time = 0;
        }
  }
  
+ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+ {
+       memset(dom, 0, sizeof(*dom));
+       spin_lock_init(&dom->lock);
+       init_timer_deferrable(&dom->period_timer);
+       dom->period_timer.function = writeout_period;
+       dom->period_timer.data = (unsigned long)dom;
+       dom->dirty_limit_tstamp = jiffies;
+       return fprop_global_init(&dom->completions, gfp);
+ }
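
   Both kinds of writeback domain in this series are expected to pass through this
   initializer: the per-memcg domain via memcg_wb_domain_init() shown earlier in this
   merge, and the global domain once during boot (the boot-time call site is not part
   of this hunk). A minimal sketch of the global side, assuming that caller treats
   failure as fatal:

    BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
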
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ void wb_domain_exit(struct wb_domain *dom)
+ {
+       del_timer_sync(&dom->period_timer);
+       fprop_global_destroy(&dom->completions);
+ }
+ #endif
  /*
   * bdi_min_ratio keeps the sum of the minimum dirty shares of all
   * registered backing devices, which, for obvious reasons, can not
@@@ -510,17 -676,26 +676,26 @@@ static unsigned long dirty_freerun_ceil
        return (thresh + bg_thresh) / 2;
  }
  
- static unsigned long hard_dirty_limit(unsigned long thresh)
+ static unsigned long hard_dirty_limit(struct wb_domain *dom,
+                                     unsigned long thresh)
  {
-       return max(thresh, global_dirty_limit);
+       return max(thresh, dom->dirty_limit);
+ }
+ /* memory available to a memcg domain is capped by system-wide clean memory */
+ static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+ {
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+       unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       mdtc->avail = min(mdtc->avail, clean);
  }
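
   With made-up numbers: if the global domain currently has gdtc->avail = 1,000,000
   pages of which 300,000 are dirty, clean memory is 700,000 pages; a memcg domain
   that derived mdtc->avail = 900,000 pages from its own headroom is capped to
   700,000, since its writers cannot dirty more clean memory than the system as a
   whole has.
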
  
  /**
-  * bdi_dirty_limit - @bdi's share of dirty throttling threshold
-  * @bdi: the backing_dev_info to query
-  * @dirty: global dirty limit in pages
+  * __wb_calc_thresh - @wb's share of dirty throttling threshold
+  * @dtc: dirty_throttle_control of interest
   *
-  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+  * Returns @wb's dirty limit in pages. The term "dirty" in the context of
   * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
   *
   * Note that balance_dirty_pages() will only seriously take it as a hard limit
   * control. For example, when the device is completely stalled due to some error
   * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
   * In the other normal situations, it acts more gently by throttling the tasks
-  * more (rather than completely block them) when the bdi dirty pages go high.
+  * more (rather than completely block them) when the wb dirty pages go high.
   *
   * It allocates high/low dirty limits to fast/slow devices, in order to prevent
   * - starving fast devices
   * - piling up dirty pages (that will take long time to sync) on slow devices
   *
-  * The bdi's share of dirty limit will be adapting to its throughput and
+  * The wb's share of dirty limit will be adapting to its throughput and
   * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
   */
- unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
  {
-       u64 bdi_dirty;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       u64 wb_thresh;
        long numerator, denominator;
+       unsigned long wb_min_ratio, wb_max_ratio;
  
        /*
-        * Calculate this BDI's share of the dirty ratio.
+        * Calculate this BDI's share of the thresh ratio.
         */
-       bdi_writeout_fraction(bdi, &numerator, &denominator);
+       fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+                             &numerator, &denominator);
+       wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+       wb_thresh *= numerator;
+       do_div(wb_thresh, denominator);
  
-       bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-       bdi_dirty *= numerator;
-       do_div(bdi_dirty, denominator);
+       wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
  
-       bdi_dirty += (dirty * bdi->min_ratio) / 100;
-       if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-               bdi_dirty = dirty * bdi->max_ratio / 100;
+       wb_thresh += (thresh * wb_min_ratio) / 100;
+       if (wb_thresh > (thresh * wb_max_ratio) / 100)
+               wb_thresh = thresh * wb_max_ratio / 100;
  
-       return bdi_dirty;
+       return wb_thresh;
+ }
+ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+ {
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+                                              .thresh = thresh };
+       return __wb_calc_thresh(&gdtc);
  }
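
   To make the proportion arithmetic concrete, a self-contained userspace toy (not
   kernel code; the completion fraction and ratios are invented). Note that for a
   cgroup wb the min/max ratios fed in here would first be scaled by the wb's share
   of the bdi's write bandwidth via wb_min_max_ratio() above:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long thresh = 200000;     /* domain dirty threshold, pages */
            long numerator = 3, denominator = 10;   /* wb's recent writeout share */
            unsigned long bdi_min_ratio = 0;        /* sum of reserved min ratios */
            unsigned long wb_min_ratio = 5, wb_max_ratio = 100;
            unsigned long long wb_thresh;

            wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;     /* 200000 */
            wb_thresh = wb_thresh * numerator / denominator;        /* 60000  */
            wb_thresh += (thresh * wb_min_ratio) / 100;             /* +10000 */
            if (wb_thresh > (thresh * wb_max_ratio) / 100)
                    wb_thresh = thresh * wb_max_ratio / 100;        /* not hit */

            printf("wb_thresh = %llu pages\n", wb_thresh);          /* 70000 */
            return 0;
    }
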
  
  /*
@@@ -580,7 -768,7 +768,7 @@@ static long long pos_ratio_polynom(unsi
        long x;
  
        x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
 -                  limit - setpoint + 1);
 +                    (limit - setpoint) | 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
   *
   * (o) global/bdi setpoints
   *
-  * We want the dirty pages be balanced around the global/bdi setpoints.
+  * We want the dirty pages be balanced around the global/wb setpoints.
   * When the number of dirty pages is higher/lower than the setpoint, the
   * dirty position control ratio (and hence task dirty ratelimit) will be
   * decreased/increased to bring the dirty pages back to the setpoint.
   *     if (dirty < setpoint) scale up   pos_ratio
   *     if (dirty > setpoint) scale down pos_ratio
   *
-  *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
-  *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+  *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
+  *     if (wb_dirty > wb_setpoint) scale down pos_ratio
   *
   *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
   *
   *   0 +------------.------------------.----------------------*------------->
   *           freerun^          setpoint^                 limit^   dirty pages
   *
-  * (o) bdi control line
+  * (o) wb control line
   *
   *     ^ pos_ratio
   *     |
   *     |                      .                           .
   *     |                      .                             .
   *   0 +----------------------.-------------------------------.------------->
-  *                bdi_setpoint^                    x_intercept^
+  *                wb_setpoint^                    x_intercept^
   *
-  * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+  * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
   * be smoothly throttled down to normal if it starts high in situations like
   * - start writing to a slow SD card and a fast disk at the same time. The SD
-  *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
-  * - the bdi dirty thresh drops quickly due to change of JBOD workload
+  *   card's wb_dirty may rush to many times higher than wb_setpoint.
+  * - the wb dirty thresh drops quickly due to change of JBOD workload
   */
- static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
-                                       unsigned long thresh,
-                                       unsigned long bg_thresh,
-                                       unsigned long dirty,
-                                       unsigned long bdi_thresh,
-                                       unsigned long bdi_dirty)
- {
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+ static void wb_position_ratio(struct dirty_throttle_control *dtc)
+ {
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+       unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;         /* dirty pages' target balance point */
-       unsigned long bdi_setpoint;
+       unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;            /* for scaling up/down the rate limit */
        long x;
  
-       if (unlikely(dirty >= limit))
-               return 0;
+       dtc->pos_ratio = 0;
+       if (unlikely(dtc->dirty >= limit))
+               return;
  
        /*
         * global setpoint
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
-       pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+       pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
  
        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
-        * such filesystems balance_dirty_pages always checks bdi counters
-        * against bdi limits. Even if global "nr_dirty" is under "freerun".
+        * such filesystems balance_dirty_pages always checks wb counters
+        * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without the strictlimit feature, fuse writeback may
         * consume an arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
-        * Here, in bdi_position_ratio(), we calculate pos_ratio based on
-        * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+        * Here, in wb_position_ratio(), we calculate pos_ratio based on
+        * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
-        * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-        * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
-        * about ~6K pages (as the average of background and throttle bdi
+        * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+        * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+        * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
-        * bdi_dirty is under bdi_setpoint and vice versa.
+        * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
-        * because we want to throttle process writing to a strictlimit BDI
+        * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               long long bdi_pos_ratio;
-               unsigned long bdi_bg_thresh;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               long long wb_pos_ratio;
  
-               if (bdi_dirty < 8)
-                       return min_t(long long, pos_ratio * 2,
-                                    2 << RATELIMIT_CALC_SHIFT);
+               if (dtc->wb_dirty < 8) {
+                       dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+                                          2 << RATELIMIT_CALC_SHIFT);
+                       return;
+               }
  
-               if (bdi_dirty >= bdi_thresh)
-                       return 0;
+               if (dtc->wb_dirty >= wb_thresh)
+                       return;
  
-               bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
-               bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
-                                                    bdi_bg_thresh);
+               wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+                                                   dtc->wb_bg_thresh);
  
-               if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
-                       return 0;
+               if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+                       return;
  
-               bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
-                                                 bdi_thresh);
+               wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+                                                wb_thresh);
  
                /*
-                * Typically, for strictlimit case, bdi_setpoint << setpoint
-                * and pos_ratio >> bdi_pos_ratio. In the other words global
+                * Typically, for strictlimit case, wb_setpoint << setpoint
+                * and pos_ratio >> wb_pos_ratio. In other words, the global
                 * state ("dirty") is not the limiting factor and we have to
-                * make decision based on bdi counters. But there is an
+                * make the decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
-                * BDIs) while given strictlimit BDI is below limit.
+                * wb's) while given strictlimit wb is below limit.
                 *
-                * "pos_ratio * bdi_pos_ratio" would work for the case above,
+                * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
-                * activity in the system coming from a single strictlimit BDI
+                * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
-                * (when globally we are at freerun and bdi is well below bdi
+                * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
-               return min(pos_ratio, bdi_pos_ratio);
+               dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+               return;
        }
  
        /*
         * We have computed basic pos_ratio above based on global situation. If
-        * the bdi is over/under its share of dirty pages, we want to scale
+        * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */
  
        /*
-        * bdi setpoint
+        * wb setpoint
         *
-        *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+        *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
-        *                        x_intercept - bdi_dirty
+        *                        x_intercept - wb_dirty
         *                     := --------------------------
-        *                        x_intercept - bdi_setpoint
+        *                        x_intercept - wb_setpoint
         *
-        * The main bdi control line is a linear function that subjects to
+        * The main wb control line is a linear function that subjects to
         *
-        * (1) f(bdi_setpoint) = 1.0
-        * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
-        *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
+        * (1) f(wb_setpoint) = 1.0
+        * (2) k = - 1 / (8 * write_bw)  (in single wb case)
+        *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
-        * For single bdi case, the dirty pages are observed to fluctuate
+        * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
-        *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+        *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield in a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
-        * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+        * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
-        * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+        * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
-       if (unlikely(bdi_thresh > thresh))
-               bdi_thresh = thresh;
+       if (unlikely(wb_thresh > dtc->thresh))
+               wb_thresh = dtc->thresh;
        /*
-        * It's very possible that bdi_thresh is close to 0 not because the
+        * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but that it has remained inactive for a long time.
         * Honour such devices with a reasonably good (hopefully IO-efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can ramp up the threshold quickly.
         */
-       bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+       wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
-        * scale global setpoint to bdi's:
-        *      bdi_setpoint = setpoint * bdi_thresh / thresh
+        * scale global setpoint to wb's:
+        *      wb_setpoint = setpoint * wb_thresh / thresh
         */
-       x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-       bdi_setpoint = setpoint * (u64)x >> 16;
 -      x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
++      x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+       wb_setpoint = setpoint * (u64)x >> 16;
        /*
-        * Use span=(8*write_bw) in single bdi case as indicated by
-        * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+        * Use span=(8*write_bw) in single wb case as indicated by
+        * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
-        *        bdi_thresh                    thresh - bdi_thresh
-        * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-        *          thresh                            thresh
+        *        wb_thresh                    thresh - wb_thresh
+        * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+        *         thresh                           thresh
         */
-       span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-       x_intercept = bdi_setpoint + span;
+       span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+       x_intercept = wb_setpoint + span;
  
-       if (bdi_dirty < x_intercept - span / 4) {
-               pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-                                     (x_intercept - bdi_setpoint) | 1);
+       if (dtc->wb_dirty < x_intercept - span / 4) {
+               pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
 -                                    x_intercept - wb_setpoint + 1);
++                                    (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;
  
        /*
-        * bdi reserve area, safeguard against dirty pool underrun and disk idle
+        * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         */
-       x_intercept = bdi_thresh / 2;
-       if (bdi_dirty < x_intercept) {
-               if (bdi_dirty > x_intercept / 8)
-                       pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+       x_intercept = wb_thresh / 2;
+       if (dtc->wb_dirty < x_intercept) {
+               if (dtc->wb_dirty > x_intercept / 8)
+                       pos_ratio = div_u64(pos_ratio * x_intercept,
+                                           dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }
  
-       return pos_ratio;
+       dtc->pos_ratio = pos_ratio;
  }
  
- static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
-                                      unsigned long elapsed,
-                                      unsigned long written)
+ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+                                     unsigned long elapsed,
+                                     unsigned long written)
  {
        const unsigned long period = roundup_pow_of_two(3 * HZ);
-       unsigned long avg = bdi->avg_write_bandwidth;
-       unsigned long old = bdi->write_bandwidth;
+       unsigned long avg = wb->avg_write_bandwidth;
+       unsigned long old = wb->write_bandwidth;
        u64 bw;
  
        /*
         * @written may have decreased due to account_page_redirty().
         * Avoid underflowing @bw calculation.
         */
-       bw = written - min(written, bdi->written_stamp);
+       bw = written - min(written, wb->written_stamp);
        bw *= HZ;
        if (unlikely(elapsed > period)) {
                do_div(bw, elapsed);
                avg = bw;
                goto out;
        }
-       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw += (u64)wb->write_bandwidth * (period - elapsed);
        bw >>= ilog2(period);
  
        /*
                avg += (old - avg) >> 3;
  
  out:
-       bdi->write_bandwidth = bw;
-       bdi->avg_write_bandwidth = avg;
+       /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+       avg = max(avg, 1LU);
+       if (wb_has_dirty_io(wb)) {
+               long delta = avg - wb->avg_write_bandwidth;
+               WARN_ON_ONCE(atomic_long_add_return(delta,
+                                       &wb->bdi->tot_write_bandwidth) <= 0);
+       }
+       wb->write_bandwidth = bw;
+       wb->avg_write_bandwidth = avg;
  }
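
wb_update_write_bandwidth() above folds the rate seen over the last interval into a roughly 3 second window and lets a smoothed average chase the result in 1/8 steps; the exact spike filter sits in elided lines, so the userspace model below approximates it:

#include <stdint.h>
#include <stdio.h>

#define HZ	100
#define PERIOD	512	/* roundup_pow_of_two(3 * HZ) */

struct bw_est {
	uint64_t write_bandwidth;	/* recent estimate, pages/s */
	uint64_t avg_write_bandwidth;	/* smoothed estimate, pages/s */
	uint64_t written_stamp;		/* pages written at the last update */
};

static void bw_update(struct bw_est *e, uint64_t written, uint64_t elapsed)
{
	uint64_t bw = (written - e->written_stamp) * HZ;
	uint64_t old = e->write_bandwidth;
	uint64_t avg = e->avg_write_bandwidth;

	if (elapsed > PERIOD) {
		bw /= elapsed;			/* long gap: trust the sample fully */
		avg = bw;
	} else {
		/* blend with the old estimate, weighted by the rest of the period */
		bw += e->write_bandwidth * (PERIOD - elapsed);
		bw >>= 9;			/* ilog2(PERIOD) */
		/* let avg follow only when it lags behind the trend (assumed filter) */
		if (avg > old && old >= bw)
			avg -= (avg - old) >> 3;
		else if (avg < old && old <= bw)
			avg += (old - avg) >> 3;
	}
	e->write_bandwidth = bw;
	e->avg_write_bandwidth = avg ? avg : 1;	/* keep avg > 0, as above */
	e->written_stamp = written;
}

int main(void)
{
	struct bw_est e = { 1000, 1000, 0 };

	bw_update(&e, 4000, 2 * HZ);	/* 4000 pages written over 2 seconds */
	printf("bw=%llu avg=%llu\n",
	       (unsigned long long)e.write_bandwidth,
	       (unsigned long long)e.avg_write_bandwidth);
	return 0;
}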
  
- /*
-  * The global dirtyable memory and dirty threshold could be suddenly knocked
-  * down by a large amount (eg. on the startup of KVM in a swapless system).
-  * This may throw the system into deep dirty exceeded state and throttle
-  * heavy/light dirtiers alike. To retain good responsiveness, maintain
-  * global_dirty_limit for tracking slowly down to the knocked down dirty
-  * threshold.
-  */
- static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ static void update_dirty_limit(struct dirty_throttle_control *dtc)
  {
-       unsigned long limit = global_dirty_limit;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       unsigned long limit = dom->dirty_limit;
  
        /*
         * Follow up in one step.
        /*
         * Follow down slowly. Use the higher one as the target, because thresh
         * may drop below dirty. This is exactly the reason to introduce
-        * global_dirty_limit which is guaranteed to lie above the dirty pages.
+        * dom->dirty_limit which is guaranteed to lie above the dirty pages.
         */
-       thresh = max(thresh, dirty);
+       thresh = max(thresh, dtc->dirty);
        if (limit > thresh) {
                limit -= (limit - thresh) >> 5;
                goto update;
        }
        return;
  update:
-       global_dirty_limit = limit;
+       dom->dirty_limit = limit;
  }
  
- static void global_update_bandwidth(unsigned long thresh,
-                                   unsigned long dirty,
+ static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
                                    unsigned long now)
  {
-       static DEFINE_SPINLOCK(dirty_lock);
-       static unsigned long update_time = INITIAL_JIFFIES;
+       struct wb_domain *dom = dtc_dom(dtc);
  
        /*
         * check locklessly first to optimize away locking for the most time
         */
-       if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+       if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                return;
  
-       spin_lock(&dirty_lock);
-       if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
-               update_dirty_limit(thresh, dirty);
-               update_time = now;
+       spin_lock(&dom->lock);
+       if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+               update_dirty_limit(dtc);
+               dom->dirty_limit_tstamp = now;
        }
-       spin_unlock(&dirty_lock);
+       spin_unlock(&dom->lock);
  }
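
update_dirty_limit() is an asymmetric tracker: the domain limit snaps up to a rising threshold at once, but when the threshold falls it decays in 1/32 steps and never aims below the current dirty count, and domain_update_bandwidth() gates the whole thing with a cheap lockless time check before taking dom->lock. A tiny pure-function model of the tracking rule (the snap-up branch is among the elided lines, so it is inferred from the "Follow up in one step" comment):

/* one 200ms step of the dirty_limit tracker, as a pure function */
static unsigned long track_dirty_limit(unsigned long limit,
				       unsigned long thresh,
				       unsigned long dirty)
{
	if (limit < thresh)		/* follow a rising thresh in one step */
		return thresh;
	if (thresh < dirty)		/* never aim below the dirty pages */
		thresh = dirty;
	if (limit > thresh)		/* follow a falling thresh slowly */
		limit -= (limit - thresh) >> 5;
	return limit;
}

Starting from limit=100000 with thresh=50000 and dirty=40000, each 200ms call walks the limit down by roughly 1/32 of the remaining gap.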
  
  /*
-  * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+  * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
   *
-  * Normal bdi tasks will be curbed at or below it in long term.
+  * Normal wb tasks will be curbed at or below it in long term.
   * Obviously it should be around (write_bw / N) when there are N dd tasks.
   */
- static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
-                                      unsigned long thresh,
-                                      unsigned long bg_thresh,
-                                      unsigned long dirty,
-                                      unsigned long bdi_thresh,
-                                      unsigned long bdi_dirty,
-                                      unsigned long dirtied,
-                                      unsigned long elapsed)
- {
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+                                     unsigned long dirtied,
+                                     unsigned long elapsed)
+ {
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long dirty = dtc->dirty;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
-       unsigned long pos_ratio;
        unsigned long step;
        unsigned long x;
  
         * The dirty rate will match the writeout rate in long term, except
         * when dirty pages are truncated by userspace or re-dirtied by FS.
         */
-       dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+       dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
  
-       pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
-                                      bdi_thresh, bdi_dirty);
        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
-                                       pos_ratio >> RATELIMIT_CALC_SHIFT;
+                                       dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
  
        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
-        * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+        * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
        /*
         * We could safely do this and return immediately:
         *
-        *      bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+        *      wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However to get a more stable dirty_ratelimit, the below elaborated
         * code makes use of task_ratelimit to filter out singular points and
        step = 0;
  
        /*
-        * For strictlimit case, calculations above were based on bdi counters
-        * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+        * For strictlimit case, calculations above were based on wb counters
+        * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
-        * Hence, to calculate "step" properly, we have to use bdi_dirty as
-        * "dirty" and bdi_setpoint as "setpoint".
+        * Hence, to calculate "step" properly, we have to use wb_dirty as
+        * "dirty" and wb_setpoint as "setpoint".
         *
-        * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
-        * it's possible that bdi_thresh is close to zero due to inactivity
-        * of backing device (see the implementation of bdi_dirty_limit()).
+        * We ramp up dirty_ratelimit forcibly if wb_dirty is low because
+        * it's possible that wb_thresh is close to zero due to inactivity
+        * of the backing device.
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               dirty = bdi_dirty;
-               if (bdi_dirty < 8)
-                       setpoint = bdi_dirty + 1;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               dirty = dtc->wb_dirty;
+               if (dtc->wb_dirty < 8)
+                       setpoint = dtc->wb_dirty + 1;
                else
-                       setpoint = (bdi_thresh +
-                                   bdi_dirty_limit(bdi, bg_thresh)) / 2;
+                       setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }
  
        if (dirty < setpoint) {
-               x = min3(bdi->balanced_dirty_ratelimit,
+               x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
-               x = max3(bdi->balanced_dirty_ratelimit,
+               x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
        else
                dirty_ratelimit -= step;
  
-       bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-       bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+       wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+       wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
  
-       trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
  }
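
The heart of the update above is the "balanced" estimate: with N dirtier tasks each throttled at task_ratelimit, the measured dirty_rate comes out near N * task_ratelimit, so scaling by write_bw / dirty_rate lands each task close to write_bw / N. A worked example with made-up numbers (RATELIMIT_CALC_SHIFT assumed to be 10):

#include <stdio.h>

#define SHIFT	10	/* RATELIMIT_CALC_SHIFT, assumed */

int main(void)
{
	unsigned long write_bw = 40000;		/* pages/s the device sustains */
	unsigned long dirty_ratelimit = 30000;	/* current base rate, too high */
	unsigned long pos_ratio = 1 << SHIFT;	/* sitting right at the setpoint */
	unsigned long dirty_rate = 60000;	/* two dirtiers, each at ~30000 pages/s */

	unsigned long task_ratelimit =
		((unsigned long long)dirty_ratelimit * pos_ratio >> SHIFT) + 1;
	unsigned long balanced =
		(unsigned long long)task_ratelimit * write_bw / dirty_rate;

	/* two dd's against a 40000 pages/s device converge on ~write_bw / 2 */
	printf("task_ratelimit=%lu balanced=%lu\n", task_ratelimit, balanced);
	return 0;
}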
  
- void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-                           unsigned long thresh,
-                           unsigned long bg_thresh,
-                           unsigned long dirty,
-                           unsigned long bdi_thresh,
-                           unsigned long bdi_dirty,
-                           unsigned long start_time)
+ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+                                 struct dirty_throttle_control *mdtc,
+                                 unsigned long start_time,
+                                 bool update_ratelimit)
  {
+       struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
-       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long elapsed = now - wb->bw_time_stamp;
        unsigned long dirtied;
        unsigned long written;
  
+       lockdep_assert_held(&wb->list_lock);
        /*
         * rate-limit, only update once every 200ms.
         */
        if (elapsed < BANDWIDTH_INTERVAL)
                return;
  
-       dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
-       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+       dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+       written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
  
        /*
         * Skip quiet periods when disk bandwidth is under-utilized.
         * (at least 1s idle time between two flusher runs)
         */
-       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+       if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
                goto snapshot;
  
-       if (thresh) {
-               global_update_bandwidth(thresh, dirty, now);
-               bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
-                                          bdi_thresh, bdi_dirty,
-                                          dirtied, elapsed);
+       if (update_ratelimit) {
+               domain_update_bandwidth(gdtc, now);
+               wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+               /*
+                * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+                * compiler has no way to figure that out.  Help it.
+                */
+               if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+                       domain_update_bandwidth(mdtc, now);
+                       wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+               }
        }
-       bdi_update_write_bandwidth(bdi, elapsed, written);
+       wb_update_write_bandwidth(wb, elapsed, written);
  
  snapshot:
-       bdi->dirtied_stamp = dirtied;
-       bdi->written_stamp = written;
-       bdi->bw_time_stamp = now;
+       wb->dirtied_stamp = dirtied;
+       wb->written_stamp = written;
+       wb->bw_time_stamp = now;
  }
  
- static void bdi_update_bandwidth(struct backing_dev_info *bdi,
-                                unsigned long thresh,
-                                unsigned long bg_thresh,
-                                unsigned long dirty,
-                                unsigned long bdi_thresh,
-                                unsigned long bdi_dirty,
-                                unsigned long start_time)
+ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
  {
-       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
-               return;
-       spin_lock(&bdi->wb.list_lock);
-       __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
-                              bdi_thresh, bdi_dirty, start_time);
-       spin_unlock(&bdi->wb.list_lock);
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+       __wb_update_bandwidth(&gdtc, NULL, start_time, false);
  }
  
  /*
@@@ -1187,10 -1366,10 +1366,10 @@@ static unsigned long dirty_poll_interva
        return 1;
  }
  
- static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-                                  unsigned long bdi_dirty)
+ static unsigned long wb_max_pause(struct bdi_writeback *wb,
+                                 unsigned long wb_dirty)
  {
-       unsigned long bw = bdi->avg_write_bandwidth;
+       unsigned long bw = wb->avg_write_bandwidth;
        unsigned long t;
  
        /*
         *
         * 8 serves as the safety ratio.
         */
-       t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+       t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
        t++;
  
        return min_t(unsigned long, t, MAX_PAUSE);
  }
  
- static long bdi_min_pause(struct backing_dev_info *bdi,
-                         long max_pause,
-                         unsigned long task_ratelimit,
-                         unsigned long dirty_ratelimit,
-                         int *nr_dirtied_pause)
+ static long wb_min_pause(struct bdi_writeback *wb,
+                        long max_pause,
+                        unsigned long task_ratelimit,
+                        unsigned long dirty_ratelimit,
+                        int *nr_dirtied_pause)
  {
-       long hi = ilog2(bdi->avg_write_bandwidth);
-       long lo = ilog2(bdi->dirty_ratelimit);
+       long hi = ilog2(wb->avg_write_bandwidth);
+       long lo = ilog2(wb->dirty_ratelimit);
        long t;         /* target pause */
        long pause;     /* estimated next pause */
        int pages;      /* target nr_dirtied_pause */
        return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
  }
  
- static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
-                                   unsigned long dirty_thresh,
-                                   unsigned long background_thresh,
-                                   unsigned long *bdi_dirty,
-                                   unsigned long *bdi_thresh,
-                                   unsigned long *bdi_bg_thresh)
+ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
  {
-       unsigned long bdi_reclaimable;
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long wb_reclaimable;
  
        /*
-        * bdi_thresh is not treated as some limiting factor as
+        * wb_thresh is not treated as a limiting factor the way
         * dirty_thresh is, for the following reasons:
-        * - in JBOD setup, bdi_thresh can fluctuate a lot
+        * - in JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
-        *   go into state (bdi_dirty >> bdi_thresh) either because
-        *   bdi_dirty starts high, or because bdi_thresh drops low.
+        *   go into state (wb_dirty >> wb_thresh) either because
+        *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
-        *   dirtiers for 100 seconds until bdi_dirty drops under
-        *   bdi_thresh. Instead the auxiliary bdi control line in
-        *   bdi_position_ratio() will let the dirtier task progress
-        *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+        *   dirtiers for 100 seconds until wb_dirty drops under
+        *   wb_thresh. Instead the auxiliary wb control line in
+        *   wb_position_ratio() will let the dirtier task progress
+        *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
-       *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-       if (bdi_bg_thresh)
-               *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
-                                                       background_thresh,
-                                                       dirty_thresh) : 0;
+       dtc->wb_thresh = __wb_calc_thresh(dtc);
+       dtc->wb_bg_thresh = dtc->thresh ?
+               div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
  
        /*
         * In order to avoid the stacked BDI deadlock we need
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
-       if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
-               bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat_sum(bdi, BDI_WRITEBACK);
+       if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+               wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
        } else {
-               bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat(bdi, BDI_WRITEBACK);
+               wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
        }
  }
  
   * perform some writeout.
   */
  static void balance_dirty_pages(struct address_space *mapping,
+                               struct bdi_writeback *wb,
                                unsigned long pages_dirtied)
  {
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+       struct dirty_throttle_control *sdtc;
        unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
-       unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
-       unsigned long background_thresh;
-       unsigned long dirty_thresh;
        long period;
        long pause;
        long max_pause;
        bool dirty_exceeded = false;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
-       unsigned long pos_ratio;
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
  
        for (;;) {
                unsigned long now = jiffies;
-               unsigned long uninitialized_var(bdi_thresh);
-               unsigned long thresh;
-               unsigned long uninitialized_var(bdi_dirty);
-               unsigned long dirty;
-               unsigned long bg_thresh;
+               unsigned long dirty, thresh, bg_thresh;
+               unsigned long m_dirty, m_thresh, m_bg_thresh;
  
                /*
                 * Unstable writes are a feature of certain networked
                 */
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
-               nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+               gdtc->avail = global_dirtyable_memory();
+               gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
  
-               global_dirty_limits(&background_thresh, &dirty_thresh);
+               domain_dirty_limits(gdtc);
  
                if (unlikely(strictlimit)) {
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, &bg_thresh);
+                       wb_dirty_limits(gdtc);
  
-                       dirty = bdi_dirty;
-                       thresh = bdi_thresh;
+                       dirty = gdtc->wb_dirty;
+                       thresh = gdtc->wb_thresh;
+                       bg_thresh = gdtc->wb_bg_thresh;
                } else {
-                       dirty = nr_dirty;
-                       thresh = dirty_thresh;
-                       bg_thresh = background_thresh;
+                       dirty = gdtc->dirty;
+                       thresh = gdtc->thresh;
+                       bg_thresh = gdtc->bg_thresh;
+               }
+               if (mdtc) {
+                       unsigned long writeback;
+                       /*
+                        * If @wb belongs to !root memcg, repeat the same
+                        * basic calculations for the memcg domain.
+                        */
+                       mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+                                           &writeback);
+                       mdtc_cap_avail(mdtc);
+                       mdtc->dirty += writeback;
+                       domain_dirty_limits(mdtc);
+                       if (unlikely(strictlimit)) {
+                               wb_dirty_limits(mdtc);
+                               m_dirty = mdtc->wb_dirty;
+                               m_thresh = mdtc->wb_thresh;
+                               m_bg_thresh = mdtc->wb_bg_thresh;
+                       } else {
+                               m_dirty = mdtc->dirty;
+                               m_thresh = mdtc->thresh;
+                               m_bg_thresh = mdtc->bg_thresh;
+                       }
                }
  
                /*
                 * Throttle it only when the background writeback cannot
                 * catch up. This avoids (excessively) small writeouts
-                * when the bdi limits are ramping up in case of !strictlimit.
+                * when the wb limits are ramping up in case of !strictlimit.
                 *
-                * In strictlimit case make decision based on the bdi counters
-                * and limits. Small writeouts when the bdi limits are ramping
+                * In strictlimit case make decision based on the wb counters
+                * and limits. Small writeouts when the wb limits are ramping
                 * up are the price we consciously pay for strictlimit-ing.
+                *
+                * If memcg domain is in effect, @dirty should be under
+                * both global and memcg freerun ceilings.
                 */
-               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+                   (!mdtc ||
+                    m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+                       unsigned long intv = dirty_poll_interval(dirty, thresh);
+                       unsigned long m_intv = ULONG_MAX;
                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
-                       current->nr_dirtied_pause =
-                               dirty_poll_interval(dirty, thresh);
+                       if (mdtc)
+                               m_intv = dirty_poll_interval(m_dirty, m_thresh);
+                       current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }
  
-               if (unlikely(!writeback_in_progress(bdi)))
-                       bdi_start_background_writeback(bdi);
+               if (unlikely(!writeback_in_progress(wb)))
+                       wb_start_background_writeback(wb);
  
+               /*
+                * Calculate global domain's pos_ratio and select the
+                * global dtc by default.
+                */
                if (!strictlimit)
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, NULL);
-               dirty_exceeded = (bdi_dirty > bdi_thresh) &&
-                                ((nr_dirty > dirty_thresh) || strictlimit);
-               if (dirty_exceeded && !bdi->dirty_exceeded)
-                       bdi->dirty_exceeded = 1;
-               bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
-                                    nr_dirty, bdi_thresh, bdi_dirty,
-                                    start_time);
-               dirty_ratelimit = bdi->dirty_ratelimit;
-               pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
-                                              background_thresh, nr_dirty,
-                                              bdi_thresh, bdi_dirty);
-               task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+                       wb_dirty_limits(gdtc);
+               dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+                       ((gdtc->dirty > gdtc->thresh) || strictlimit);
+               wb_position_ratio(gdtc);
+               sdtc = gdtc;
+               if (mdtc) {
+                       /*
+                        * If memcg domain is in effect, calculate its
+                        * pos_ratio.  @wb should satisfy constraints from
+                        * both global and memcg domains.  Choose the one
+                        * w/ lower pos_ratio.
+                        */
+                       if (!strictlimit)
+                               wb_dirty_limits(mdtc);
+                       dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+                               ((mdtc->dirty > mdtc->thresh) || strictlimit);
+                       wb_position_ratio(mdtc);
+                       if (mdtc->pos_ratio < gdtc->pos_ratio)
+                               sdtc = mdtc;
+               }
+               if (dirty_exceeded && !wb->dirty_exceeded)
+                       wb->dirty_exceeded = 1;
+               if (time_is_before_jiffies(wb->bw_time_stamp +
+                                          BANDWIDTH_INTERVAL)) {
+                       spin_lock(&wb->list_lock);
+                       __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+                       spin_unlock(&wb->list_lock);
+               }
+               /* throttle according to the chosen dtc */
+               dirty_ratelimit = wb->dirty_ratelimit;
+               task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
-               max_pause = bdi_max_pause(bdi, bdi_dirty);
-               min_pause = bdi_min_pause(bdi, max_pause,
-                                         task_ratelimit, dirty_ratelimit,
-                                         &nr_dirtied_pause);
+               max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+               min_pause = wb_min_pause(wb, max_pause,
+                                        task_ratelimit, dirty_ratelimit,
+                                        &nr_dirtied_pause);
  
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(bdi,
-                                                 dirty_thresh,
-                                                 background_thresh,
-                                                 nr_dirty,
-                                                 bdi_thresh,
-                                                 bdi_dirty,
+                                                 sdtc->thresh,
+                                                 sdtc->bg_thresh,
+                                                 sdtc->dirty,
+                                                 sdtc->wb_thresh,
+                                                 sdtc->wb_dirty,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
  
  pause:
                trace_balance_dirty_pages(bdi,
-                                         dirty_thresh,
-                                         background_thresh,
-                                         nr_dirty,
-                                         bdi_thresh,
-                                         bdi_dirty,
+                                         sdtc->thresh,
+                                         sdtc->bg_thresh,
+                                         sdtc->dirty,
+                                         sdtc->wb_thresh,
+                                         sdtc->wb_dirty,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
                current->nr_dirtied_pause = nr_dirtied_pause;
  
                /*
-                * This is typically equal to (nr_dirty < dirty_thresh) and can
-                * also keep "1000+ dd on a slow USB stick" under control.
+                * This is typically equal to (dirty < thresh) and can also
+                * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;
  
                /*
                 * In the case of an unresponsive NFS server and the NFS dirty
-                * pages exceeds dirty_thresh, give the other good bdi's a pipe
+                * pages exceed dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
-                * more page. However bdi_dirty has accounting errors.  So use
-                * the larger and more IO friendly bdi_stat_error.
+                * more page. However wb_dirty has accounting errors.  So use
+                * the larger and more IO friendly wb_stat_error.
                 */
-               if (bdi_dirty <= bdi_stat_error(bdi))
+               if (sdtc->wb_dirty <= wb_stat_error(wb))
                        break;
  
                if (fatal_signal_pending(current))
                        break;
        }
  
-       if (!dirty_exceeded && bdi->dirty_exceeded)
-               bdi->dirty_exceeded = 0;
+       if (!dirty_exceeded && wb->dirty_exceeded)
+               wb->dirty_exceeded = 0;
  
-       if (writeback_in_progress(bdi))
+       if (writeback_in_progress(wb))
                return;
  
        /*
        if (laptop_mode)
                return;
  
-       if (nr_reclaimable > background_thresh)
-               bdi_start_background_writeback(bdi);
+       if (nr_reclaimable > gdtc->bg_thresh)
+               wb_start_background_writeback(wb);
  }
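
The loop above turns the chosen rate into a sleep. The pause arithmetic itself falls in the elided lines, so the model below is only an assumption consistent with what is visible: a period derived from pages_dirtied and task_ratelimit, credit for time already spent since the last pause, and a cap at the per-wb max_pause:

#include <stdio.h>

#define HZ		100
#define MAX_PAUSE	(HZ / 5)	/* ~200ms cap, assumed */

static long compute_pause(unsigned long pages_dirtied,
			  unsigned long task_ratelimit,	/* pages per second */
			  long since_last_pause)	/* jiffies since the last sleep */
{
	long period, pause;

	if (task_ratelimit == 0)
		return MAX_PAUSE;		/* fully throttled: sleep the maximum */
	period = HZ * pages_dirtied / task_ratelimit;	/* jiffies "owed" for the dirtying */
	pause = period - since_last_pause;	/* credit time already spent running */
	if (pause > MAX_PAUSE)
		pause = MAX_PAUSE;
	return pause;				/* <= 0 means no sleep is needed yet */
}

int main(void)
{
	/* 32 pages dirtied at 1600 pages/s with 1 jiffy already elapsed: sleep 1 jiffy */
	printf("%ld\n", compute_pause(32, 1600, 1));
	return 0;
}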
  
  static DEFINE_PER_CPU(int, bdp_ratelimits);
@@@ -1577,15 -1809,22 +1809,22 @@@ DEFINE_PER_CPU(int, dirty_throttle_leak
   */
  void balance_dirty_pages_ratelimited(struct address_space *mapping)
  {
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct inode *inode = mapping->host;
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct bdi_writeback *wb = NULL;
        int ratelimit;
        int *p;
  
        if (!bdi_cap_account_dirty(bdi))
                return;
  
+       if (inode_cgwb_enabled(inode))
+               wb = wb_get_create_current(bdi, GFP_KERNEL);
+       if (!wb)
+               wb = &bdi->wb;
        ratelimit = current->nr_dirtied_pause;
-       if (bdi->dirty_exceeded)
+       if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
  
        preempt_disable();
        preempt_enable();
  
        if (unlikely(current->nr_dirtied >= ratelimit))
-               balance_dirty_pages(mapping, current->nr_dirtied);
+               balance_dirty_pages(mapping, wb, current->nr_dirtied);
+       wb_put(wb);
  }
  EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
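
balance_dirty_pages_ratelimited() keeps the common path cheap: the full balancing loop only runs once the task has dirtied nr_dirtied_pause pages, with a much lower trigger while the wb has exceeded its limits; the per-CPU leak counters handled in the elided lines are omitted. A sketch of that gate:

/* per-task state mirrored from current->nr_dirtied{,_pause}, illustrative only */
struct task_model {
	int nr_dirtied;
	int nr_dirtied_pause;
};

/* returns 1 when the caller should run the full balance_dirty_pages() path */
static int should_balance(struct task_model *t, int wb_dirty_exceeded)
{
	int ratelimit = t->nr_dirtied_pause;

	if (wb_dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* 32 >> (PAGE_SHIFT - 10) with 4K pages */

	t->nr_dirtied++;		/* one more page dirtied by this task */
	return t->nr_dirtied >= ratelimit;
}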
  
+ /**
+  * wb_over_bg_thresh - does @wb need to be written back?
+  * @wb: bdi_writeback of interest
+  *
+  * Determines whether background writeback should keep writing @wb or it's
+  * clean enough.  Returns %true if writeback should continue.
+  */
+ bool wb_over_bg_thresh(struct bdi_writeback *wb)
+ {
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+       /*
+        * Similar to balance_dirty_pages() but ignores pages being written
+        * as we're trying to decide whether to put more under writeback.
+        */
+       gdtc->avail = global_dirtyable_memory();
+       gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+                     global_page_state(NR_UNSTABLE_NFS);
+       domain_dirty_limits(gdtc);
+       if (gdtc->dirty > gdtc->bg_thresh)
+               return true;
+       if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+               return true;
+       if (mdtc) {
+               unsigned long writeback;
+               mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+               mdtc_cap_avail(mdtc);
+               domain_dirty_limits(mdtc);      /* ditto, ignore writeback */
+               if (mdtc->dirty > mdtc->bg_thresh)
+                       return true;
+               if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+                       return true;
+       }
+       return false;
+ }
  void throttle_vm_writeout(gfp_t gfp_mask)
  {
        unsigned long background_thresh;
  
          for ( ; ; ) {
                global_dirty_limits(&background_thresh, &dirty_thresh);
-               dirty_thresh = hard_dirty_limit(dirty_thresh);
+               dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
  
                  /*
                   * Boost the allowable dirty threshold a bit for page
@@@ -1667,14 -1955,20 +1955,20 @@@ void laptop_mode_timer_fn(unsigned lon
        struct request_queue *q = (struct request_queue *)data;
        int nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
+       struct bdi_writeback *wb;
+       struct wb_iter iter;
  
        /*
         * We want to write everything out, not just down to the dirty
         * threshold
         */
-       if (bdi_has_dirty_io(&q->backing_dev_info))
-               bdi_start_writeback(&q->backing_dev_info, nr_pages,
-                                       WB_REASON_LAPTOP_TIMER);
+       if (!bdi_has_dirty_io(&q->backing_dev_info))
+               return;
+       bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+               if (wb_has_dirty_io(wb))
+                       wb_start_writeback(wb, nr_pages, true,
+                                          WB_REASON_LAPTOP_TIMER);
  }
  
  /*
@@@ -1718,10 -2012,12 +2012,12 @@@ void laptop_sync_completion(void
  
  void writeback_set_ratelimit(void)
  {
+       struct wb_domain *dom = &global_wb_domain;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       global_dirty_limit = dirty_thresh;
+       dom->dirty_limit = dirty_thresh;
        ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
@@@ -1770,7 -2066,7 +2066,7 @@@ void __init page_writeback_init(void
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
  
-       fprop_global_init(&writeout_completions, GFP_KERNEL);
+       BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
  }
  
  /**
@@@ -2090,19 -2386,29 +2386,29 @@@ int __set_page_dirty_no_writeback(struc
  
  /*
   * Helper function for set_page_dirty family.
+  *
+  * Caller must hold mem_cgroup_begin_page_stat().
+  *
   * NOTE: This relies on being atomic wrt interrupts.
   */
- void account_page_dirtied(struct page *page, struct address_space *mapping)
+ void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg)
  {
+       struct inode *inode = mapping->host;
        trace_writeback_dirty_page(page, mapping);
  
        if (mapping_cap_account_dirty(mapping)) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct bdi_writeback *wb;
  
+               inode_attach_wb(inode, page);
+               wb = inode_to_wb(inode);
+               mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
-               __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
-               __inc_bdi_stat(bdi, BDI_DIRTIED);
+               __inc_wb_stat(wb, WB_RECLAIMABLE);
+               __inc_wb_stat(wb, WB_DIRTIED);
                task_io_account_write(PAGE_CACHE_SIZE);
                current->nr_dirtied++;
                this_cpu_inc(bdp_ratelimits);
@@@ -2113,21 -2419,18 +2419,18 @@@ EXPORT_SYMBOL(account_page_dirtied)
  /*
   * Helper function for deaccounting dirty page without writeback.
   *
-  * Doing this should *normally* only ever be done when a page
-  * is truncated, and is not actually mapped anywhere at all. However,
-  * fs/buffer.c does this when it notices that somebody has cleaned
-  * out all the buffers on a page without actually doing it through
-  * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+  * Caller must hold mem_cgroup_begin_page_stat().
   */
- void account_page_cleaned(struct page *page, struct address_space *mapping)
+ void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb)
  {
        if (mapping_cap_account_dirty(mapping)) {
+               mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                dec_zone_page_state(page, NR_FILE_DIRTY);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+               dec_wb_stat(wb, WB_RECLAIMABLE);
                task_io_account_cancelled_write(PAGE_CACHE_SIZE);
        }
  }
- EXPORT_SYMBOL(account_page_cleaned);
  
  /*
   * For address_spaces which do not use buffers.  Just tag the page as dirty in
   */
  int __set_page_dirty_nobuffers(struct page *page)
  {
+       struct mem_cgroup *memcg;
+       memcg = mem_cgroup_begin_page_stat(page);
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
                unsigned long flags;
  
-               if (!mapping)
+               if (!mapping) {
+                       mem_cgroup_end_page_stat(memcg);
                        return 1;
+               }
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                BUG_ON(page_mapping(page) != mapping);
                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
+               account_page_dirtied(page, mapping, memcg);
                radix_tree_tag_set(&mapping->page_tree, page_index(page),
                                   PAGECACHE_TAG_DIRTY);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
                }
                return 1;
        }
+       mem_cgroup_end_page_stat(memcg);
        return 0;
  }
  EXPORT_SYMBOL(__set_page_dirty_nobuffers);
  void account_page_redirty(struct page *page)
  {
        struct address_space *mapping = page->mapping;
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               bool locked;
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                current->nr_dirtied--;
                dec_zone_page_state(page, NR_DIRTIED);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+               dec_wb_stat(wb, WB_DIRTIED);
+               unlocked_inode_to_wb_end(inode, locked);
        }
  }
  EXPORT_SYMBOL(account_page_redirty);
@@@ -2265,6 -2583,43 +2583,43 @@@ int set_page_dirty_lock(struct page *pa
  }
  EXPORT_SYMBOL(set_page_dirty_lock);
  
+ /*
+  * This cancels just the dirty bit on the kernel page itself; it does NOT
+  * actually remove dirty bits on any mmap's that may be around. It also
+  * leaves the page tagged dirty, so any sync activity will still find it on
+  * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+  * look at the dirty bits in the VM.
+  *
+  * This should *normally* only ever be done when a page is truncated and is
+  * not actually mapped anywhere at all. However, fs/buffer.c does this when
+  * it notices that somebody has cleaned out all the buffers on a page
+  * without actually doing it through the VM. Can you say "ext3 is horribly
+  * ugly"? Thought you could.
+  */
+ void cancel_dirty_page(struct page *page)
+ {
+       struct address_space *mapping = page_mapping(page);
+
+       if (mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+               if (TestClearPageDirty(page))
+                       account_page_cleaned(page, mapping, memcg, wb);
+
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+       } else {
+               ClearPageDirty(page);
+       }
+ }
+ EXPORT_SYMBOL(cancel_dirty_page);
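
Roughly how a truncate-style caller is expected to use the cancel_dirty_page() exported above; a sketch modeled on a simplified truncate path rather than the actual fs/ code, with example_truncate_complete_page being a hypothetical name:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch only -- not part of the merge. */
static void example_truncate_complete_page(struct address_space *mapping,
					   struct page *page)
{
	if (page_has_private(page))
		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);

	/* Drop the dirty bit and its accounting; the page is going away. */
	cancel_dirty_page(page);
	ClearPageMappedToDisk(page);
	delete_from_page_cache(page);
}
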
  /*
   * Clear a page's dirty flag, while caring for dirty memory accounting.
   * Returns true if the page was previously dirty.
  int clear_page_dirty_for_io(struct page *page)
  {
        struct address_space *mapping = page_mapping(page);
+       int ret = 0;
  
        BUG_ON(!PageLocked(page));
  
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
                /*
                 * Yes, Virginia, this is indeed insane.
                 *
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                if (TestClearPageDirty(page)) {
+                       mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                        dec_zone_page_state(page, NR_FILE_DIRTY);
-                       dec_bdi_stat(inode_to_bdi(mapping->host),
-                                       BDI_RECLAIMABLE);
-                       return 1;
+                       dec_wb_stat(wb, WB_RECLAIMABLE);
+                       ret = 1;
                }
-               return 0;
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+               return ret;
        }
        return TestClearPageDirty(page);
  }
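
A sketch of the canonical writeout hand-off that clear_page_dirty_for_io() serves, as in pageout()-style callers (one appears in the mm/vmscan.c hunks below); example_start_writeout is a hypothetical name and error handling is omitted:

#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Sketch only -- not part of the merge. */
static int example_start_writeout(struct page *page,
				  struct writeback_control *wbc)
{
	struct address_space *mapping = page_mapping(page);

	if (!clear_page_dirty_for_io(page))
		return 0;	/* someone else already cleaned or wrote it */

	/* ->writepage() is expected to set PageWriteback and unlock the page. */
	return mapping->a_ops->writepage(page, wbc);
}
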
@@@ -2341,7 -2706,8 +2706,8 @@@ int test_clear_page_writeback(struct pa
  
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi)) {
-                               __dec_bdi_stat(bdi, BDI_WRITEBACK);
-                               __bdi_writeout_inc(bdi);
+                               struct bdi_writeback *wb = inode_to_wb(inode);
+                               __dec_wb_stat(wb, WB_WRITEBACK);
+                               __wb_writeout_inc(wb);
                        }
                }
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@@ -2376,7 -2744,8 +2744,8 @@@ int __test_set_page_writeback(struct pa
  
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi))
-                               __inc_bdi_stat(bdi, BDI_WRITEBACK);
+                               __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
                }
                if (!PageDirty(page))
                        radix_tree_tag_clear(&mapping->page_tree,
diff --combined mm/rmap.c
index 7af1ecb21ccb2d560ca9f0f21e002a26465737f4,8fc556ce2dcb7f538c81a7c590c769d9da34aa6a..171b68768df1478355bcddd5e30c2edd616ba05b
+++ b/mm/rmap.c
@@@ -30,6 -30,8 +30,8 @@@
   *             swap_lock (in swap_duplicate, swap_info_get)
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
+  *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+  *                   mapping->tree_lock (widely used)
   *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
   *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
@@@ -625,7 -627,7 +627,7 @@@ pmd_t *mm_find_pmd(struct mm_struct *mm
  
        pmd = pmd_offset(pud, address);
        /*
 -       * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
 +       * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
@@@ -950,12 -952,7 +952,12 @@@ void page_move_anon_rmap(struct page *p
        VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
  
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 -      page->mapping = (struct address_space *) anon_vma;
 +      /*
 +       * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
 +       * simultaneously, so a concurrent reader (eg page_referenced()'s
 +       * PageAnon()) will not see one without the other.
 +       */
 +      WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
  }
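
For context, a sketch of the reader side this WRITE_ONCE() pairs with: a page_anon_vma()-style lookup that must observe the anon_vma pointer and the PAGE_MAPPING_ANON bit together. example_page_anon_vma is a made-up name and the real helpers differ in detail:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Sketch only -- not part of the merge. */
static struct anon_vma *example_page_anon_vma(struct page *page)
{
	unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	/* Strip the PAGE_MAPPING_ANON tag set by the writer above. */
	return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}
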
  
  /**
diff --combined mm/vmscan.c
index 19ef01e90ac42077c3d7898d5ef0d149a166b1aa,8cb16ebaf3ed083a73fc9ed8104a20de71e77dde..e61445dce04e3cc83e9704e84f3d5bf9074b31db
@@@ -154,11 -154,42 +154,42 @@@ static bool global_reclaim(struct scan_
  {
        return !sc->target_mem_cgroup;
  }
+ /**
+  * sane_reclaim - is the usual dirty throttling mechanism operational?
+  * @sc: scan_control in question
+  *
+  * The normal page dirty throttling mechanism in balance_dirty_pages() is
+  * completely broken with the legacy memcg, so direct stalling in
+  * shrink_page_list() is used for throttling instead; it lacks all the
+  * niceties such as fairness, adaptive pausing, bandwidth-proportional
+  * allocation and configurability.
+  *
+  * This function tests whether the vmscan currently in progress can assume
+  * that the normal dirty throttling mechanism is operational.
+  */
+ static bool sane_reclaim(struct scan_control *sc)
+ {
+       struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+       if (!memcg)
+               return true;
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+               return true;
+ #endif
+       return false;
+ }
  #else
  static bool global_reclaim(struct scan_control *sc)
  {
        return true;
  }
+ static bool sane_reclaim(struct scan_control *sc)
+ {
+       return true;
+ }
  #endif
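
A condensed, illustrative view of how sane_reclaim() is consulted in the shrink_page_list() hunks further down; kswapd's immediate-reclaim case, locking and counters are left out, and example_handle_writeback_page is a hypothetical name:

/* Sketch only -- not part of the merge. */
static bool example_handle_writeback_page(struct page *page,
					   struct scan_control *sc)
{
	if (!PageWriteback(page))
		return false;

	if (sane_reclaim(sc) || !PageReclaim(page) ||
	    !(sc->gfp_mask & __GFP_IO)) {
		/* Dirty throttling is operational (or we must not block):
		 * tag the page and keep scanning. */
		SetPageReclaim(page);
		return true;
	}

	/* Legacy memcg reclaim: stall here to avoid dirty-page OOMs. */
	wait_on_page_writeback(page);
	return true;
}
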
  
  static unsigned long zone_reclaimable_pages(struct zone *zone)
@@@ -452,14 -483,13 +483,13 @@@ static inline int is_page_cache_freeabl
        return page_count(page) - page_has_private(page) == 2;
  }
  
- static int may_write_to_queue(struct backing_dev_info *bdi,
-                             struct scan_control *sc)
+ static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
  {
        if (current->flags & PF_SWAPWRITE)
                return 1;
-       if (!bdi_write_congested(bdi))
+       if (!inode_write_congested(inode))
                return 1;
-       if (bdi == current->backing_dev_info)
+       if (inode_to_bdi(inode) == current->backing_dev_info)
                return 1;
        return 0;
  }
@@@ -538,7 -568,7 +568,7 @@@ static pageout_t pageout(struct page *p
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+       if (!may_write_to_inode(mapping->host, sc))
                return PAGE_KEEP;
  
        if (clear_page_dirty_for_io(page)) {
  static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed)
  {
+       unsigned long flags;
+       struct mem_cgroup *memcg;
+
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
  
-       spin_lock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
        /*
         * The non racy check for a busy page.
         *
                swp_entry_t swap = { .val = page_private(page) };
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping))
                        shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow);
-               spin_unlock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(page, shadow, memcg);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
  
                if (freepage != NULL)
                        freepage(page);
        return 1;
  
  cannot_free:
-       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
        return 0;
  }
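
The change above establishes the removal protocol used after this merge: the memcg page-stat lock brackets mapping->tree_lock (now taken irqsave), and __delete_from_page_cache() receives the memcg explicitly. A minimal sketch under those assumptions; example_delete_from_page_cache is a made-up name and ->freepage() handling is omitted:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Sketch only -- not part of the merge. */
static void example_delete_from_page_cache(struct address_space *mapping,
					   struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page);
	spin_lock_irqsave(&mapping->tree_lock, flags);
	__delete_from_page_cache(page, NULL /* no shadow entry */, memcg);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);
}
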
  
@@@ -917,7 -954,7 +954,7 @@@ static unsigned long shrink_page_list(s
                 */
                mapping = page_mapping(page);
                if (((dirty || writeback) && mapping &&
-                    bdi_write_congested(inode_to_bdi(mapping->host))) ||
+                    inode_write_congested(mapping->host)) ||
                    (writeback && PageReclaim(page)))
                        nr_congested++;
  
                 *    note that the LRU is being scanned too quickly and the
                 *    caller can stall after page list has been processed.
                 *
-                * 2) Global reclaim encounters a page, memcg encounters a
-                *    page that is not marked for immediate reclaim or
-                *    the caller does not have __GFP_IO. In this case mark
-                *    the page for immediate reclaim and continue scanning.
+                * 2) Global or new memcg reclaim encounters a page that is
+                *    not marked for immediate reclaim or the caller does not
+                *    have __GFP_IO. In this case mark the page for immediate
+                *    reclaim and continue scanning.
                 *
                 *    __GFP_IO is checked  because a loop driver thread might
                 *    enter reclaim, and deadlock if it waits on a page for
                 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
                 *    may_enter_fs here is liable to OOM on them.
                 *
-                * 3) memcg encounters a page that is not already marked
+                * 3) Legacy memcg encounters a page that is not already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
                 *    pages are in writeback and there is nothing else to
                                goto keep_locked;
  
                        /* Case 2 above */
-                       } else if (global_reclaim(sc) ||
+                       } else if (sane_reclaim(sc) ||
                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                /*
                                 * This is slightly racy - end_page_writeback()
@@@ -1416,7 -1453,7 +1453,7 @@@ static int too_many_isolated(struct zon
        if (current_is_kswapd())
                return 0;
  
-       if (!global_reclaim(sc))
+       if (!sane_reclaim(sc))
                return 0;
  
        if (file) {
@@@ -1608,10 -1645,10 +1645,10 @@@ shrink_inactive_list(unsigned long nr_t
                set_bit(ZONE_WRITEBACK, &zone->flags);
  
        /*
-        * memcg will stall in page writeback so only consider forcibly
-        * stalling for global reclaim
+        * Legacy memcg will stall in page writeback so avoid forcibly
+        * stalling here.
         */
-       if (global_reclaim(sc)) {
+       if (sane_reclaim(sc)) {
                /*
                 * Tag a zone as congested if all the dirty pages scanned were
                 * backed by a congested BDI and wait_iff_congested will stall.
@@@ -2646,8 -2683,7 +2683,8 @@@ static bool pfmemalloc_watermark_ok(pg_
  
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
 -              if (!populated_zone(zone))
 +              if (!populated_zone(zone) ||
 +                  zone_reclaimable_pages(zone) == 0)
                        continue;
  
                pfmemalloc_reserve += min_wmark_pages(zone);
@@@ -3597,7 -3633,7 +3634,7 @@@ int zone_reclaim_mode __read_mostly
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)   /* Run shrink_inactive_list on the zone */
  #define RECLAIM_WRITE (1<<1)  /* Writeout pages during reclaim */
 -#define RECLAIM_SWAP (1<<2)   /* Swap pages out during reclaim */
 +#define RECLAIM_UNMAP (1<<2)  /* Unmap pages during reclaim */
  
  /*
   * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@@ -3639,12 -3675,12 +3676,12 @@@ static long zone_pagecache_reclaimable(
        long delta = 0;
  
        /*
 -       * If RECLAIM_SWAP is set, then all file pages are considered
 +       * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
         * pages like swapcache and zone_unmapped_file_pages() provides
         * a better estimate
         */
 -      if (zone_reclaim_mode & RECLAIM_SWAP)
 +      if (zone_reclaim_mode & RECLAIM_UNMAP)
                nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
        else
                nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@@ -3675,15 -3711,15 +3712,15 @@@ static int __zone_reclaim(struct zone *
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 -              .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 +              .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
        };
  
        cond_resched();
        /*
 -       * We need to be able to allocate from the reserves for RECLAIM_SWAP
 +       * We need to be able to allocate from the reserves for RECLAIM_UNMAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
 -       * and RECLAIM_SWAP.
 +       * and RECLAIM_UNMAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);