Merge branch 'for-4.2/writeback' of git://git.kernel.dk/linux-block
author Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 25 Jun 2015 23:00:17 +0000 (16:00 -0700)
Pull cgroup writeback support from Jens Axboe:
 "This is the big pull request for adding cgroup writeback support.

  This code has been in development for a long time, and it has been
  simmering in for-next for a good chunk of this cycle too.  This is one
  of those problems that have been talked about for at least half a
  decade; finally there's a solution and code to go with it.

  Also see last week's writeup on LWN:

        http://lwn.net/Articles/648292/"
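
Enabling this from the filesystem side comes down to a superblock flag plus a
bdi capability: the series ends up with the per-superblock SB_I_CGROUPWB flag
(see the "vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB"
commit in the list below), and blk_alloc_queue_node() now tags the queue's
backing_dev_info with BDI_CAP_CGROUP_WRITEBACK (visible in the block/blk-core.c
hunk further down).  A minimal sketch, assuming only those flag names from the
series; the helper itself is hypothetical:

	#include <linux/fs.h>

	/*
	 * Hypothetical helper, not part of this diff: setting SB_I_CGROUPWB
	 * tells the writeback core that this filesystem's dirty inodes may be
	 * attached to per-cgroup bdi_writeback's instead of always to bdi->wb.
	 */
	static void example_enable_cgroup_writeback(struct super_block *sb)
	{
		sb->s_iflags |= SB_I_CGROUPWB;
	}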

* 'for-4.2/writeback' of git://git.kernel.dk/linux-block: (85 commits)
  writeback, blkio: add documentation for cgroup writeback support
  vfs, writeback: replace FS_CGROUP_WRITEBACK with SB_I_CGROUPWB
  writeback: do foreign inode detection iff cgroup writeback is enabled
  v9fs: fix error handling in v9fs_session_init()
  bdi: fix wrong error return value in cgwb_create()
  buffer: remove unusued 'ret' variable
  writeback: disassociate inodes from dying bdi_writebacks
  writeback: implement foreign cgroup inode bdi_writeback switching
  writeback: add lockdep annotation to inode_to_wb()
  writeback: use unlocked_inode_to_wb transaction in inode_congested()
  writeback: implement unlocked_inode_to_wb transaction and use it for stat updates
  writeback: implement [locked_]inode_to_wb_and_lock_list()
  writeback: implement foreign cgroup inode detection
  writeback: make writeback_control track the inode being written back
  writeback: relocate wb[_try]_get(), wb_put(), inode_{attach|detach}_wb()
  mm: vmscan: disable memcg direct reclaim stalling if cgroup writeback support is in use
  writeback: implement memcg writeback domain based throttling
  writeback: reset wb_domain->dirty_limit[_tstmp] when memcg domain size changes
  writeback: implement memcg wb_domain
  writeback: update wb_over_bg_thresh() to use wb_domain aware operations
  ...

32 files changed:
block/blk-cgroup.c
block/blk-core.c
block/blk-sysfs.c
block/bounce.c
block/cfq-iosched.c
block/elevator.c
block/genhd.c
drivers/md/dm.c
drivers/md/raid10.c
fs/ext4/extents.c
fs/ext4/mballoc.c
fs/ext4/super.c
fs/f2fs/node.c
fs/f2fs/segment.h
fs/inode.c
fs/nfs/write.c
fs/ocfs2/file.c
fs/xfs/xfs_file.c
include/linux/backing-dev.h
include/linux/blk-cgroup.h
include/linux/blkdev.h
include/linux/fs.h
include/linux/memcontrol.h
include/linux/mm.h
include/trace/events/writeback.h
init/Kconfig
mm/backing-dev.c
mm/filemap.c
mm/memcontrol.c
mm/page-writeback.c
mm/rmap.c
mm/vmscan.c

diff --combined block/blk-cgroup.c
index 6e43fa355e7127e8e2b10ff33eee5c0ab43ccf90,31610ae0ebff2bcbd6b9d80da9f04e08bcc1b697..9f97da52d006281b1ab3e2911d85934216e3931a
@@@ -9,30 -9,30 +9,33 @@@
   *
   * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
   *                  Nauman Rafique <nauman@google.com>
 + *
 + * For policy-specific per-blkcg data:
 + * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
 + *                    Arianna Avanzini <avanzini.arianna@gmail.com>
   */
  #include <linux/ioprio.h>
  #include <linux/kdev_t.h>
  #include <linux/module.h>
  #include <linux/err.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/slab.h>
  #include <linux/genhd.h>
  #include <linux/delay.h>
  #include <linux/atomic.h>
- #include "blk-cgroup.h"
+ #include <linux/blk-cgroup.h>
  #include "blk.h"
  
  #define MAX_KEY_LEN 100
  
  static DEFINE_MUTEX(blkcg_pol_mutex);
  
 -struct blkcg blkcg_root = { .cfq_weight = 2 * CFQ_WEIGHT_DEFAULT,
 -                          .cfq_leaf_weight = 2 * CFQ_WEIGHT_DEFAULT, };
 +struct blkcg blkcg_root;
  EXPORT_SYMBOL_GPL(blkcg_root);
  
+ struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
  static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
  
  static bool blkcg_policy_enabled(struct request_queue *q,
@@@ -182,6 -182,7 +185,7 @@@ static struct blkcg_gq *blkg_create(str
                                    struct blkcg_gq *new_blkg)
  {
        struct blkcg_gq *blkg;
+       struct bdi_writeback_congested *wb_congested;
        int i, ret;
  
        WARN_ON_ONCE(!rcu_read_lock_held());
                goto err_free_blkg;
        }
  
+       wb_congested = wb_congested_get_create(&q->backing_dev_info,
+                                              blkcg->css.id, GFP_ATOMIC);
+       if (!wb_congested) {
+               ret = -ENOMEM;
+               goto err_put_css;
+       }
        /* allocate */
        if (!new_blkg) {
                new_blkg = blkg_alloc(blkcg, q, GFP_ATOMIC);
                if (unlikely(!new_blkg)) {
                        ret = -ENOMEM;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
        }
        blkg = new_blkg;
+       blkg->wb_congested = wb_congested;
  
        /* link parent */
        if (blkcg_parent(blkcg)) {
                blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
                if (WARN_ON_ONCE(!blkg->parent)) {
                        ret = -EINVAL;
-                       goto err_put_css;
+                       goto err_put_congested;
                }
                blkg_get(blkg->parent);
        }
        blkg->online = true;
        spin_unlock(&blkcg->lock);
  
-       if (!ret) {
-               if (blkcg == &blkcg_root) {
-                       q->root_blkg = blkg;
-                       q->root_rl.blkg = blkg;
-               }
+       if (!ret)
                return blkg;
-       }
  
        /* @blkg failed fully initialized, use the usual release path */
        blkg_put(blkg);
        return ERR_PTR(ret);
  
+ err_put_congested:
+       wb_congested_put(wb_congested);
  err_put_css:
        css_put(&blkcg->css);
  err_free_blkg:
@@@ -342,15 -348,6 +351,6 @@@ static void blkg_destroy(struct blkcg_g
        if (rcu_access_pointer(blkcg->blkg_hint) == blkg)
                rcu_assign_pointer(blkcg->blkg_hint, NULL);
  
-       /*
-        * If root blkg is destroyed.  Just clear the pointer since root_rl
-        * does not take reference on root blkg.
-        */
-       if (blkcg == &blkcg_root) {
-               blkg->q->root_blkg = NULL;
-               blkg->q->root_rl.blkg = NULL;
-       }
        /*
         * Put the reference taken at the time of creation so that when all
         * queues are gone, group can be destroyed.
@@@ -405,6 -402,8 +405,8 @@@ void __blkg_release_rcu(struct rcu_hea
        if (blkg->parent)
                blkg_put(blkg->parent);
  
+       wb_congested_put(blkg->wb_congested);
        blkg_free(blkg);
  }
  EXPORT_SYMBOL_GPL(__blkg_release_rcu);
@@@ -812,6 -811,8 +814,8 @@@ static void blkcg_css_offline(struct cg
        }
  
        spin_unlock_irq(&blkcg->lock);
+       wb_blkcg_offline(blkcg);
  }
  
  static void blkcg_css_free(struct cgroup_subsys_state *css)
@@@ -826,8 -827,6 +830,8 @@@ static struct cgroup_subsys_state 
  blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
  {
        struct blkcg *blkcg;
 +      struct cgroup_subsys_state *ret;
 +      int i;
  
        if (!parent_css) {
                blkcg = &blkcg_root;
        }
  
        blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
 -      if (!blkcg)
 -              return ERR_PTR(-ENOMEM);
 +      if (!blkcg) {
 +              ret = ERR_PTR(-ENOMEM);
 +              goto free_blkcg;
 +      }
 +
 +      for (i = 0; i < BLKCG_MAX_POLS ; i++) {
 +              struct blkcg_policy *pol = blkcg_policy[i];
 +              struct blkcg_policy_data *cpd;
 +
 +              /*
 +               * If the policy hasn't been attached yet, wait for it
 +               * to be attached before doing anything else. Otherwise,
 +               * check if the policy requires any specific per-cgroup
 +               * data: if it does, allocate and initialize it.
 +               */
 +              if (!pol || !pol->cpd_size)
 +                      continue;
 +
 +              BUG_ON(blkcg->pd[i]);
 +              cpd = kzalloc(pol->cpd_size, GFP_KERNEL);
 +              if (!cpd) {
 +                      ret = ERR_PTR(-ENOMEM);
 +                      goto free_pd_blkcg;
 +              }
 +              blkcg->pd[i] = cpd;
 +              cpd->plid = i;
 +              pol->cpd_init_fn(blkcg);
 +      }
  
 -      blkcg->cfq_weight = CFQ_WEIGHT_DEFAULT;
 -      blkcg->cfq_leaf_weight = CFQ_WEIGHT_DEFAULT;
  done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+ #endif
        return &blkcg->css;
 +
 +free_pd_blkcg:
 +      for (i--; i >= 0; i--)
 +              kfree(blkcg->pd[i]);
 +
 +free_blkcg:
 +      kfree(blkcg);
 +      return ret;
  }
  
  /**
   */
  int blkcg_init_queue(struct request_queue *q)
  {
-       might_sleep();
+       struct blkcg_gq *new_blkg, *blkg;
+       bool preloaded;
+       int ret;
+       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+       if (!new_blkg)
+               return -ENOMEM;
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+       /*
+        * Make sure the root blkg exists and count the existing blkgs.  As
+        * @q is bypassing at this point, blkg_lookup_create() can't be
+        * used.  Open code insertion.
+        */
+       rcu_read_lock();
+       spin_lock_irq(q->queue_lock);
+       blkg = blkg_create(&blkcg_root, q, new_blkg);
+       spin_unlock_irq(q->queue_lock);
+       rcu_read_unlock();
+       if (preloaded)
+               radix_tree_preload_end();
+       if (IS_ERR(blkg)) {
+               kfree(new_blkg);
+               return PTR_ERR(blkg);
+       }
+       q->root_blkg = blkg;
+       q->root_rl.blkg = blkg;
  
-       return blk_throtl_init(q);
+       ret = blk_throtl_init(q);
+       if (ret) {
+               spin_lock_irq(q->queue_lock);
+               blkg_destroy_all(q);
+               spin_unlock_irq(q->queue_lock);
+       }
+       return ret;
  }
  
  /**
@@@ -995,57 -1000,20 +1037,26 @@@ int blkcg_activate_policy(struct reques
                          const struct blkcg_policy *pol)
  {
        LIST_HEAD(pds);
-       struct blkcg_gq *blkg, *new_blkg;
 +      LIST_HEAD(cpds);
 -      struct blkg_policy_data *pd, *n;
+       struct blkcg_gq *blkg;
 +      struct blkg_policy_data *pd, *nd;
 +      struct blkcg_policy_data *cpd, *cnd;
        int cnt = 0, ret;
-       bool preloaded;
  
        if (blkcg_policy_enabled(q, pol))
                return 0;
  
-       /* preallocations for root blkg */
-       new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
-       if (!new_blkg)
-               return -ENOMEM;
+       /* count and allocate policy_data for all existing blkgs */
        blk_queue_bypass_start(q);
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-       /*
-        * Make sure the root blkg exists and count the existing blkgs.  As
-        * @q is bypassing at this point, blkg_lookup_create() can't be
-        * used.  Open code it.
-        */
        spin_lock_irq(q->queue_lock);
-       rcu_read_lock();
-       blkg = __blkg_lookup(&blkcg_root, q, false);
-       if (blkg)
-               blkg_free(new_blkg);
-       else
-               blkg = blkg_create(&blkcg_root, q, new_blkg);
-       rcu_read_unlock();
-       if (preloaded)
-               radix_tree_preload_end();
-       if (IS_ERR(blkg)) {
-               ret = PTR_ERR(blkg);
-               goto out_unlock;
-       }
        list_for_each_entry(blkg, &q->blkg_list, q_node)
                cnt++;
        spin_unlock_irq(q->queue_lock);
  
 +      /*
 +       * Allocate per-blkg and per-blkcg policy data
 +       * for all existing blkgs.
 +       */
        while (cnt--) {
                pd = kzalloc_node(pol->pd_size, GFP_KERNEL, q->node);
                if (!pd) {
                        goto out_free;
                }
                list_add_tail(&pd->alloc_node, &pds);
 +
 +              if (!pol->cpd_size)
 +                      continue;
 +              cpd = kzalloc_node(pol->cpd_size, GFP_KERNEL, q->node);
 +              if (!cpd) {
 +                      ret = -ENOMEM;
 +                      goto out_free;
 +              }
 +              list_add_tail(&cpd->alloc_node, &cpds);
        }
  
        /*
 -       * Install the allocated pds With @q bypassing, no new blkg
 +       * Install the allocated pds and cpds. With @q bypassing, no new blkg
         * should have been created while the queue lock was dropped.
         */
        spin_lock_irq(q->queue_lock);
  
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
 -              if (WARN_ON(list_empty(&pds))) {
 +              if (WARN_ON(list_empty(&pds)) ||
 +                  WARN_ON(pol->cpd_size && list_empty(&cpds))) {
                        /* umm... this shouldn't happen, just abort */
                        ret = -ENOMEM;
                        goto out_unlock;
                }
 +              cpd = list_first_entry(&cpds, struct blkcg_policy_data,
 +                                     alloc_node);
 +              list_del_init(&cpd->alloc_node);
                pd = list_first_entry(&pds, struct blkg_policy_data, alloc_node);
                list_del_init(&pd->alloc_node);
  
                /* grab blkcg lock too while installing @pd on @blkg */
                spin_lock(&blkg->blkcg->lock);
  
 +              if (!pol->cpd_size)
 +                      goto no_cpd;
 +              if (!blkg->blkcg->pd[pol->plid]) {
 +                      /* Per-policy per-blkcg data */
 +                      blkg->blkcg->pd[pol->plid] = cpd;
 +                      cpd->plid = pol->plid;
 +                      pol->cpd_init_fn(blkg->blkcg);
 +              } else { /* must free it as it has already been extracted */
 +                      kfree(cpd);
 +              }
 +no_cpd:
                blkg->pd[pol->plid] = pd;
                pd->blkg = blkg;
                pd->plid = pol->plid;
@@@ -1111,10 -1055,8 +1122,10 @@@ out_unlock
        spin_unlock_irq(q->queue_lock);
  out_free:
        blk_queue_bypass_end(q);
 -      list_for_each_entry_safe(pd, n, &pds, alloc_node)
 +      list_for_each_entry_safe(pd, nd, &pds, alloc_node)
                kfree(pd);
 +      list_for_each_entry_safe(cpd, cnd, &cpds, alloc_node)
 +              kfree(cpd);
        return ret;
  }
  EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@@ -1140,10 -1082,6 +1151,6 @@@ void blkcg_deactivate_policy(struct req
  
        __clear_bit(pol->plid, q->blkcg_pols);
  
-       /* if no policy is left, no need for blkgs - shoot them down */
-       if (bitmap_empty(q->blkcg_pols, BLKCG_MAX_POLS))
-               blkg_destroy_all(q);
        list_for_each_entry(blkg, &q->blkg_list, q_node) {
                /* grab blkcg lock too while removing @pd from @blkg */
                spin_lock(&blkg->blkcg->lock);
  
                kfree(blkg->pd[pol->plid]);
                blkg->pd[pol->plid] = NULL;
 +              kfree(blkg->blkcg->pd[pol->plid]);
 +              blkg->blkcg->pd[pol->plid] = NULL;
  
                spin_unlock(&blkg->blkcg->lock);
        }
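
The blkcg core above now also carries policy-specific per-blkcg data (on top of
the existing per-blkg policy data): blkcg_css_alloc() allocates pol->cpd_size
bytes for each registered policy that declares a cpd_size and calls its
cpd_init_fn, and blkcg_activate_policy()/blkcg_deactivate_policy() allocate and
free the same data for policies enabled later.  That is what lets CFQ move its
weights out of struct blkcg and into its own cfq_group_data (see the
block/cfq-iosched.c diff below).  A condensed sketch of the pattern with a
hypothetical policy; the hooks and fields are the ones used in this diff:

	#include <linux/blk-cgroup.h>

	/* Hypothetical policy-private per-blkcg data; mirrors cfq_group_data. */
	struct example_group_data {
		struct blkcg_policy_data pd;	/* must be the first member */
		unsigned int weight;
	};

	static struct blkcg_policy blkcg_policy_example;

	static void example_cpd_init(const struct blkcg *blkcg)
	{
		struct example_group_data *egd =
			container_of(blkcg->pd[blkcg_policy_example.plid],
				     struct example_group_data, pd);

		egd->weight = 500;	/* some per-cgroup default */
	}

	static struct blkcg_policy blkcg_policy_example = {
		/* .pd_size, .cftypes, .pd_init_fn etc. omitted for brevity */
		.cpd_size	= sizeof(struct example_group_data),
		.cpd_init_fn	= example_cpd_init,
	};
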
diff --combined block/blk-core.c
index f6ab750060fe019f97d0ccfbca367b9e6cd3b426,a4a2dbe46fe30df73de26d8b7f11f485220a3ce7..688ae9482cb8eab438d3bbcaf6d61602a366cfc8
  #include <linux/delay.h>
  #include <linux/ratelimit.h>
  #include <linux/pm_runtime.h>
+ #include <linux/blk-cgroup.h>
  
  #define CREATE_TRACE_POINTS
  #include <trace/events/block.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  #include "blk-mq.h"
  
  EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
@@@ -63,6 -63,31 +63,31 @@@ struct kmem_cache *blk_requestq_cachep
   */
  static struct workqueue_struct *kblockd_workqueue;
  
+ static void blk_clear_congested(struct request_list *rl, int sync)
+ {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       clear_wb_congested(rl->blkg->wb_congested, sync);
+ #else
+       /*
+        * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't
+        * flip its congestion state for events on other blkcgs.
+        */
+       if (rl == &rl->q->root_rl)
+               clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+ #endif
+ }
+ static void blk_set_congested(struct request_list *rl, int sync)
+ {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       set_wb_congested(rl->blkg->wb_congested, sync);
+ #else
+       /* see blk_clear_congested() */
+       if (rl == &rl->q->root_rl)
+               set_wb_congested(rl->q->backing_dev_info.wb.congested, sync);
+ #endif
+ }
  void blk_queue_congestion_threshold(struct request_queue *q)
  {
        int nr;
@@@ -554,8 -579,6 +579,8 @@@ void blk_cleanup_queue(struct request_q
                q->queue_lock = &q->__queue_lock;
        spin_unlock_irq(lock);
  
 +      bdi_destroy(&q->backing_dev_info);
 +
        /* @q is and will stay empty, shutdown and put */
        blk_put_queue(q);
  }
@@@ -623,8 -646,7 +648,7 @@@ struct request_queue *blk_alloc_queue_n
  
        q->backing_dev_info.ra_pages =
                        (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
-       q->backing_dev_info.state = 0;
-       q->backing_dev_info.capabilities = 0;
+       q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK;
        q->backing_dev_info.name = "block";
        q->node = node_id;
  
@@@ -736,8 -758,6 +760,8 @@@ blk_init_queue_node(request_fn_proc *rf
  }
  EXPORT_SYMBOL(blk_init_queue_node);
  
 +static void blk_queue_bio(struct request_queue *q, struct bio *bio);
 +
  struct request_queue *
  blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
                         spinlock_t *lock)
@@@ -847,13 -867,8 +871,8 @@@ static void __freed_request(struct requ
  {
        struct request_queue *q = rl->q;
  
-       /*
-        * bdi isn't aware of blkcg yet.  As all async IOs end up root
-        * blkcg anyway, just use root blkcg state.
-        */
-       if (rl == &q->root_rl &&
-           rl->count[sync] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, sync);
+       if (rl->count[sync] < queue_congestion_off_threshold(q))
+               blk_clear_congested(rl, sync);
  
        if (rl->count[sync] + 1 <= q->nr_requests) {
                if (waitqueue_active(&rl->wait[sync]))
@@@ -886,25 -901,25 +905,25 @@@ static void freed_request(struct reques
  int blk_update_nr_requests(struct request_queue *q, unsigned int nr)
  {
        struct request_list *rl;
+       int on_thresh, off_thresh;
  
        spin_lock_irq(q->queue_lock);
        q->nr_requests = nr;
        blk_queue_congestion_threshold(q);
+       on_thresh = queue_congestion_on_threshold(q);
+       off_thresh = queue_congestion_off_threshold(q);
  
-       /* congestion isn't cgroup aware and follows root blkcg for now */
-       rl = &q->root_rl;
-       if (rl->count[BLK_RW_SYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_SYNC);
-       else if (rl->count[BLK_RW_SYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_SYNC);
+       blk_queue_for_each_rl(rl, q) {
+               if (rl->count[BLK_RW_SYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_SYNC);
+               else if (rl->count[BLK_RW_SYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_SYNC);
  
-       if (rl->count[BLK_RW_ASYNC] >= queue_congestion_on_threshold(q))
-               blk_set_queue_congested(q, BLK_RW_ASYNC);
-       else if (rl->count[BLK_RW_ASYNC] < queue_congestion_off_threshold(q))
-               blk_clear_queue_congested(q, BLK_RW_ASYNC);
+               if (rl->count[BLK_RW_ASYNC] >= on_thresh)
+                       blk_set_congested(rl, BLK_RW_ASYNC);
+               else if (rl->count[BLK_RW_ASYNC] < off_thresh)
+                       blk_clear_congested(rl, BLK_RW_ASYNC);
  
-       blk_queue_for_each_rl(rl, q) {
                if (rl->count[BLK_RW_SYNC] >= q->nr_requests) {
                        blk_set_rl_full(rl, BLK_RW_SYNC);
                } else {
@@@ -1014,12 -1029,7 +1033,7 @@@ static struct request *__get_request(st
                                }
                        }
                }
-               /*
-                * bdi isn't aware of blkcg yet.  As all async IOs end up
-                * root blkcg anyway, just use root blkcg state.
-                */
-               if (rl == &q->root_rl)
-                       blk_set_queue_congested(q, is_sync);
+               blk_set_congested(rl, is_sync);
        }
  
        /*
@@@ -1591,7 -1601,7 +1605,7 @@@ void init_request_from_bio(struct reque
        blk_rq_bio_prep(req->q, req, bio);
  }
  
 -void blk_queue_bio(struct request_queue *q, struct bio *bio)
 +static void blk_queue_bio(struct request_queue *q, struct bio *bio)
  {
        const bool sync = !!(bio->bi_rw & REQ_SYNC);
        struct blk_plug *plug;
@@@ -1699,6 -1709,7 +1713,6 @@@ out_unlock
                spin_unlock_irq(q->queue_lock);
        }
  }
 -EXPORT_SYMBOL_GPL(blk_queue_bio);     /* for device mapper only */
  
  /*
   * If bio->bi_dev is a partition, remap the location
diff --combined block/blk-sysfs.c
index 2b8fd302f677a967d87994f8a7532aab8dfe6569,1b60941dc4c65c45e05450e3a162b341b33aeec3..6264b382d4d1ba8765dc3b22cead4fd9bf384d99
@@@ -6,11 -6,12 +6,12 @@@
  #include <linux/module.h>
  #include <linux/bio.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/blktrace_api.h>
  #include <linux/blk-mq.h>
+ #include <linux/blk-cgroup.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  #include "blk-mq.h"
  
  struct queue_sysfs_entry {
@@@ -522,6 -523,8 +523,6 @@@ static void blk_release_queue(struct ko
  
        blk_trace_shutdown(q);
  
 -      bdi_destroy(&q->backing_dev_info);
 -
        ida_simple_remove(&blk_queue_ida, q->id);
        call_rcu(&q->rcu_head, blk_free_queue_rcu);
  }
diff --combined block/bounce.c
index 3ab0bce1c947ef9be81f09139aa73d9bd4b76ff5,072280b3dd138e7cecf555c914fbb99c0589d618..b17311227c12764f18760ee4ce71fa828f939f45
@@@ -13,6 -13,7 +13,7 @@@
  #include <linux/pagemap.h>
  #include <linux/mempool.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/init.h>
  #include <linux/hash.h>
  #include <linux/highmem.h>
@@@ -218,8 -219,8 +219,8 @@@ bounce
                if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
                        continue;
  
 -              inc_zone_page_state(to->bv_page, NR_BOUNCE);
                to->bv_page = mempool_alloc(pool, q->bounce_gfp);
 +              inc_zone_page_state(to->bv_page, NR_BOUNCE);
  
                if (rw == WRITE) {
                        char *vto, *vfrom;
diff --combined block/cfq-iosched.c
index d8ad45ccd8fa784a60dac66d91158eb0c9065b27,bc8f429307736988d036b3d50a95ca964cc6650a..c62bb2e650b8c741e64ead5c9f32b090cbf19730
@@@ -14,8 -14,8 +14,8 @@@
  #include <linux/rbtree.h>
  #include <linux/ioprio.h>
  #include <linux/blktrace_api.h>
+ #include <linux/blk-cgroup.h>
  #include "blk.h"
- #include "blk-cgroup.h"
  
  /*
   * tunables
@@@ -67,11 -67,6 +67,11 @@@ static struct kmem_cache *cfq_pool
  #define sample_valid(samples) ((samples) > 80)
  #define rb_entry_cfqg(node)   rb_entry((node), struct cfq_group, rb_node)
  
 +/* blkio-related constants */
 +#define CFQ_WEIGHT_MIN          10
 +#define CFQ_WEIGHT_MAX          1000
 +#define CFQ_WEIGHT_DEFAULT      500
 +
  struct cfq_ttime {
        unsigned long last_end_request;
  
@@@ -217,15 -212,6 +217,15 @@@ struct cfqg_stats 
  #endif        /* CONFIG_CFQ_GROUP_IOSCHED */
  };
  
 +/* Per-cgroup data */
 +struct cfq_group_data {
 +      /* must be the first member */
 +      struct blkcg_policy_data pd;
 +
 +      unsigned int weight;
 +      unsigned int leaf_weight;
 +};
 +
  /* This is per cgroup per device grouping structure */
  struct cfq_group {
        /* must be the first member */
@@@ -460,6 -446,16 +460,6 @@@ CFQ_CFQQ_FNS(deep)
  CFQ_CFQQ_FNS(wait_busy);
  #undef CFQ_CFQQ_FNS
  
 -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 -{
 -      return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 -}
 -
 -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 -{
 -      return pd_to_blkg(&cfqg->pd);
 -}
 -
  #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
  
  /* cfqg stats flags */
@@@ -604,22 -600,6 +604,22 @@@ static inline void cfqg_stats_update_av
  
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
  
 +static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
 +{
 +      return pd ? container_of(pd, struct cfq_group, pd) : NULL;
 +}
 +
 +static struct cfq_group_data
 +*cpd_to_cfqgd(struct blkcg_policy_data *cpd)
 +{
 +      return cpd ? container_of(cpd, struct cfq_group_data, pd) : NULL;
 +}
 +
 +static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg)
 +{
 +      return pd_to_blkg(&cfqg->pd);
 +}
 +
  static struct blkcg_policy blkcg_policy_cfq;
  
  static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg)
        return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq));
  }
  
 +static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg)
 +{
 +      return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq));
 +}
 +
  static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg)
  {
        struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent;
@@@ -1569,28 -1544,13 +1569,28 @@@ static void cfqg_stats_init(struct cfqg
  #endif
  }
  
 +static void cfq_cpd_init(const struct blkcg *blkcg)
 +{
 +      struct cfq_group_data *cgd =
 +              cpd_to_cfqgd(blkcg->pd[blkcg_policy_cfq.plid]);
 +
 +      if (blkcg == &blkcg_root) {
 +              cgd->weight = 2 * CFQ_WEIGHT_DEFAULT;
 +              cgd->leaf_weight = 2 * CFQ_WEIGHT_DEFAULT;
 +      } else {
 +              cgd->weight = CFQ_WEIGHT_DEFAULT;
 +              cgd->leaf_weight = CFQ_WEIGHT_DEFAULT;
 +      }
 +}
 +
  static void cfq_pd_init(struct blkcg_gq *blkg)
  {
        struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkg->blkcg);
  
        cfq_init_cfqg_base(cfqg);
 -      cfqg->weight = blkg->blkcg->cfq_weight;
 -      cfqg->leaf_weight = blkg->blkcg->cfq_leaf_weight;
 +      cfqg->weight = cgd->weight;
 +      cfqg->leaf_weight = cgd->leaf_weight;
        cfqg_stats_init(&cfqg->stats);
        cfqg_stats_init(&cfqg->dead_stats);
  }
@@@ -1713,27 -1673,13 +1713,27 @@@ static int cfqg_print_leaf_weight_devic
  
  static int cfq_print_weight(struct seq_file *sf, void *v)
  {
 -      seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_weight);
 +      struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
 +      unsigned int val = 0;
 +
 +      if (cgd)
 +              val = cgd->weight;
 +
 +      seq_printf(sf, "%u\n", val);
        return 0;
  }
  
  static int cfq_print_leaf_weight(struct seq_file *sf, void *v)
  {
 -      seq_printf(sf, "%u\n", css_to_blkcg(seq_css(sf))->cfq_leaf_weight);
 +      struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
 +      struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
 +      unsigned int val = 0;
 +
 +      if (cgd)
 +              val = cgd->leaf_weight;
 +
 +      seq_printf(sf, "%u\n", val);
        return 0;
  }
  
@@@ -1744,7 -1690,6 +1744,7 @@@ static ssize_t __cfqg_set_weight_device
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        struct cfq_group *cfqg;
 +      struct cfq_group_data *cfqgd;
        int ret;
  
        ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx);
  
        ret = -EINVAL;
        cfqg = blkg_to_cfqg(ctx.blkg);
 +      cfqgd = blkcg_to_cfqgd(blkcg);
 +      if (!cfqg || !cfqgd)
 +              goto err;
 +
        if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) {
                if (!is_leaf_weight) {
                        cfqg->dev_weight = ctx.v;
 -                      cfqg->new_weight = ctx.v ?: blkcg->cfq_weight;
 +                      cfqg->new_weight = ctx.v ?: cfqgd->weight;
                } else {
                        cfqg->dev_leaf_weight = ctx.v;
 -                      cfqg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight;
 +                      cfqg->new_leaf_weight = ctx.v ?: cfqgd->leaf_weight;
                }
                ret = 0;
        }
  
 +err:
        blkg_conf_finish(&ctx);
        return ret ?: nbytes;
  }
@@@ -1790,23 -1730,16 +1790,23 @@@ static int __cfq_set_weight(struct cgro
  {
        struct blkcg *blkcg = css_to_blkcg(css);
        struct blkcg_gq *blkg;
 +      struct cfq_group_data *cfqgd;
 +      int ret = 0;
  
        if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
                return -EINVAL;
  
        spin_lock_irq(&blkcg->lock);
 +      cfqgd = blkcg_to_cfqgd(blkcg);
 +      if (!cfqgd) {
 +              ret = -EINVAL;
 +              goto out;
 +      }
  
        if (!is_leaf_weight)
 -              blkcg->cfq_weight = val;
 +              cfqgd->weight = val;
        else
 -              blkcg->cfq_leaf_weight = val;
 +              cfqgd->leaf_weight = val;
  
        hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
                struct cfq_group *cfqg = blkg_to_cfqg(blkg);
  
                if (!is_leaf_weight) {
                        if (!cfqg->dev_weight)
 -                              cfqg->new_weight = blkcg->cfq_weight;
 +                              cfqg->new_weight = cfqgd->weight;
                } else {
                        if (!cfqg->dev_leaf_weight)
 -                              cfqg->new_leaf_weight = blkcg->cfq_leaf_weight;
 +                              cfqg->new_leaf_weight = cfqgd->leaf_weight;
                }
        }
  
 +out:
        spin_unlock_irq(&blkcg->lock);
 -      return 0;
 +      return ret;
  }
  
  static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft,
@@@ -4545,18 -4477,6 +4545,18 @@@ out_free
        return ret;
  }
  
 +static void cfq_registered_queue(struct request_queue *q)
 +{
 +      struct elevator_queue *e = q->elevator;
 +      struct cfq_data *cfqd = e->elevator_data;
 +
 +      /*
 +       * Default to IOPS mode with no idling for SSDs
 +       */
 +      if (blk_queue_nonrot(q))
 +              cfqd->cfq_slice_idle = 0;
 +}
 +
  /*
   * sysfs parts below -->
   */
@@@ -4672,7 -4592,6 +4672,7 @@@ static struct elevator_type iosched_cf
                .elevator_may_queue_fn =        cfq_may_queue,
                .elevator_init_fn =             cfq_init_queue,
                .elevator_exit_fn =             cfq_exit_queue,
 +              .elevator_registered_fn =       cfq_registered_queue,
        },
        .icq_size       =       sizeof(struct cfq_io_cq),
        .icq_align      =       __alignof__(struct cfq_io_cq),
  #ifdef CONFIG_CFQ_GROUP_IOSCHED
  static struct blkcg_policy blkcg_policy_cfq = {
        .pd_size                = sizeof(struct cfq_group),
 +      .cpd_size               = sizeof(struct cfq_group_data),
        .cftypes                = cfq_blkcg_files,
  
 +      .cpd_init_fn            = cfq_cpd_init,
        .pd_init_fn             = cfq_pd_init,
        .pd_offline_fn          = cfq_pd_offline,
        .pd_reset_stats_fn      = cfq_pd_reset_stats,
diff --combined block/elevator.c
index 942579d04128b5484f2d3e53bf38b4994ef852ee,3bbb48f430e40e4fc022d13e1f469e09377f662e..84d63943f2de2f386ff35e6a395f68ada173b5b2
  #include <linux/hash.h>
  #include <linux/uaccess.h>
  #include <linux/pm_runtime.h>
+ #include <linux/blk-cgroup.h>
  
  #include <trace/events/block.h>
  
  #include "blk.h"
- #include "blk-cgroup.h"
  
  static DEFINE_SPINLOCK(elv_list_lock);
  static LIST_HEAD(elv_list);
@@@ -157,7 -157,7 +157,7 @@@ struct elevator_queue *elevator_alloc(s
  
        eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
        if (unlikely(!eq))
 -              goto err;
 +              return NULL;
  
        eq->type = e;
        kobject_init(&eq->kobj, &elv_ktype);
        hash_init(eq->hash);
  
        return eq;
 -err:
 -      kfree(eq);
 -      elevator_put(e);
 -      return NULL;
  }
  EXPORT_SYMBOL(elevator_alloc);
  
@@@ -806,8 -810,6 +806,8 @@@ int elv_register_queue(struct request_q
                }
                kobject_uevent(&e->kobj, KOBJ_ADD);
                e->registered = 1;
 +              if (e->type->ops.elevator_registered_fn)
 +                      e->type->ops.elevator_registered_fn(q);
        }
        return error;
  }
diff --combined block/genhd.c
index ea982eadaf6380b974d6b1d39a7197085217ac91,d46ba566d62faeffc2b95089c0e684699c9257e4..59a1395eedac45e3e5d6326ed2956caf8a7de7c8
@@@ -8,6 -8,7 +8,7 @@@
  #include <linux/kdev_t.h>
  #include <linux/kernel.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/init.h>
  #include <linux/spinlock.h>
  #include <linux/proc_fs.h>
@@@ -422,9 -423,9 +423,9 @@@ int blk_alloc_devt(struct hd_struct *pa
        /* allocate ext devt */
        idr_preload(GFP_KERNEL);
  
 -      spin_lock(&ext_devt_lock);
 +      spin_lock_bh(&ext_devt_lock);
        idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
 -      spin_unlock(&ext_devt_lock);
 +      spin_unlock_bh(&ext_devt_lock);
  
        idr_preload_end();
        if (idx < 0)
@@@ -449,9 -450,9 +450,9 @@@ void blk_free_devt(dev_t devt
                return;
  
        if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
 -              spin_lock(&ext_devt_lock);
 +              spin_lock_bh(&ext_devt_lock);
                idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
 -              spin_unlock(&ext_devt_lock);
 +              spin_unlock_bh(&ext_devt_lock);
        }
  }
  
@@@ -653,6 -654,7 +654,6 @@@ void del_gendisk(struct gendisk *disk
        disk->flags &= ~GENHD_FL_UP;
  
        sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
 -      bdi_unregister(&disk->queue->backing_dev_info);
        blk_unregister_queue(disk);
        blk_unregister_region(disk_devt(disk), disk->minors);
  
@@@ -690,13 -692,13 +691,13 @@@ struct gendisk *get_gendisk(dev_t devt
        } else {
                struct hd_struct *part;
  
 -              spin_lock(&ext_devt_lock);
 +              spin_lock_bh(&ext_devt_lock);
                part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
                if (part && get_disk(part_to_disk(part))) {
                        *partno = part->partno;
                        disk = part_to_disk(part);
                }
 -              spin_unlock(&ext_devt_lock);
 +              spin_unlock_bh(&ext_devt_lock);
        }
  
        return disk;
diff --combined drivers/md/dm.c
index 4d6f089a0e9e2eca5b8fa58017a29e1da598c2a0,2161ed9329c41e95969415a23ec76af87a3476b1..d72829922eb6c8a2c81f2892f7c265a5bb0d9f24
@@@ -1031,11 -1031,13 +1031,11 @@@ static void rq_completed(struct mapped_
        dm_put(md);
  }
  
 -static void free_rq_clone(struct request *clone, bool must_be_mapped)
 +static void free_rq_clone(struct request *clone)
  {
        struct dm_rq_target_io *tio = clone->end_io_data;
        struct mapped_device *md = tio->md;
  
 -      WARN_ON_ONCE(must_be_mapped && !clone->q);
 -
        if (md->type == DM_TYPE_MQ_REQUEST_BASED)
                /* stacked on blk-mq queue(s) */
                tio->ti->type->release_clone_rq(clone);
@@@ -1077,7 -1079,7 +1077,7 @@@ static void dm_end_request(struct reque
                        rq->sense_len = clone->sense_len;
        }
  
 -      free_rq_clone(clone, true);
 +      free_rq_clone(clone);
        if (!rq->q->mq_ops)
                blk_end_request_all(rq, error);
        else
@@@ -1096,7 -1098,7 +1096,7 @@@ static void dm_unprep_request(struct re
        }
  
        if (clone)
 -              free_rq_clone(clone, false);
 +              free_rq_clone(clone);
  }
  
  /*
@@@ -1109,7 -1111,6 +1109,7 @@@ static void old_requeue_request(struct 
  
        spin_lock_irqsave(q->queue_lock, flags);
        blk_requeue_request(q, rq);
 +      blk_run_queue_async(q);
        spin_unlock_irqrestore(q->queue_lock, flags);
  }
  
@@@ -1670,7 -1671,8 +1670,7 @@@ static int dm_merge_bvec(struct request
        struct mapped_device *md = q->queuedata;
        struct dm_table *map = dm_get_live_table_fast(md);
        struct dm_target *ti;
 -      sector_t max_sectors;
 -      int max_size = 0;
 +      sector_t max_sectors, max_size = 0;
  
        if (unlikely(!map))
                goto out;
        max_sectors = min(max_io_len(bvm->bi_sector, ti),
                          (sector_t) queue_max_sectors(q));
        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
 -      if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
 -              max_size = 0;
 +
 +      /*
 +       * FIXME: this stop-gap fix _must_ be cleaned up (by passing a sector_t
 +       * to the targets' merge function since it holds sectors not bytes).
 +       * Just doing this as an interim fix for stable@ because the more
 +       * comprehensive cleanup of switching to sector_t will impact every
 +       * DM target that implements a ->merge hook.
 +       */
 +      if (max_size > INT_MAX)
 +              max_size = INT_MAX;
  
        /*
         * merge_bvec_fn() returns number of bytes
         * max is precomputed maximal io size
         */
        if (max_size && ti->type->merge)
 -              max_size = ti->type->merge(ti, bvm, biovec, max_size);
 +              max_size = ti->type->merge(ti, bvm, biovec, (int) max_size);
        /*
         * If the target doesn't support merge method and some of the devices
         * provided their merge_bvec method (we know this by looking for the
@@@ -1893,8 -1887,8 +1893,8 @@@ static int map_request(struct dm_rq_tar
                        dm_kill_unmapped_request(rq, r);
                        return r;
                }
 -              if (IS_ERR(clone))
 -                      return DM_MAPIO_REQUEUE;
 +              if (r != DM_MAPIO_REMAPPED)
 +                      return r;
                setup_clone(clone, rq, tio);
        }
  
@@@ -2080,7 -2074,7 +2080,7 @@@ static int dm_any_congested(void *conge
                         * the query about congestion status of request_queue
                         */
                        if (dm_request_based(md))
-                               r = md->queue->backing_dev_info.state &
+                               r = md->queue->backing_dev_info.wb.state &
                                    bdi_bits;
                        else
                                r = dm_table_any_congested(map, bdi_bits);
@@@ -2669,15 -2663,13 +2669,15 @@@ static int dm_mq_queue_rq(struct blk_mq
        if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
                /* clone request is allocated at the end of the pdu */
                tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
 -              if (!clone_rq(rq, md, tio, GFP_ATOMIC))
 -                      return BLK_MQ_RQ_QUEUE_BUSY;
 +              (void) clone_rq(rq, md, tio, GFP_ATOMIC);
                queue_kthread_work(&md->kworker, &tio->work);
        } else {
                /* Direct call is fine since .queue_rq allows allocations */
 -              if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
 -                      dm_requeue_unmapped_original_request(md, rq);
 +              if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
 +                      /* Undo dm_start_request() before requeuing */
 +                      rq_completed(md, rq_data_dir(rq), false);
 +                      return BLK_MQ_RQ_QUEUE_BUSY;
 +              }
        }
  
        return BLK_MQ_RQ_QUEUE_OK;
diff --combined drivers/md/raid10.c
index f55c3f35b7463141086afb727785c775c5185d76,fca825718f29a2ba6d5d39789ba6593699b3aec9..188d8e9a6bdcc39e4da54095466f45683d6b2177
@@@ -914,7 -914,7 +914,7 @@@ static int raid10_congested(struct mdde
        struct r10conf *conf = mddev->private;
        int i, ret = 0;
  
-       if ((bits & (1 << BDI_async_congested)) &&
+       if ((bits & (1 << WB_async_congested)) &&
            conf->pending_count >= max_queued_requests)
                return 1;
  
@@@ -4156,7 -4156,6 +4156,7 @@@ static int raid10_start_reshape(struct 
  
        clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
        clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
 +      clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
        set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
        set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
  
diff --combined fs/ext4/extents.c
index d86d2622f82631f3fa46abe1047caf319b968402,e8b5866ffa07f838c82778757fee69fc81bc097f..aadb7282883493597f8dae099f20c3e86694bea8
@@@ -39,6 -39,7 +39,7 @@@
  #include <linux/slab.h>
  #include <asm/uaccess.h>
  #include <linux/fiemap.h>
+ #include <linux/backing-dev.h>
  #include "ext4_jbd2.h"
  #include "ext4_extents.h"
  #include "xattr.h"
@@@ -377,7 -378,7 +378,7 @@@ static int ext4_valid_extent(struct ino
        ext4_lblk_t lblock = le32_to_cpu(ext->ee_block);
        ext4_lblk_t last = lblock + len - 1;
  
 -      if (lblock > last)
 +      if (len == 0 || lblock > last)
                return 0;
        return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
  }
@@@ -4456,8 -4457,6 +4457,8 @@@ int ext4_ext_map_blocks(handle_t *handl
                ar.flags |= EXT4_MB_HINT_NOPREALLOC;
        if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                ar.flags |= EXT4_MB_DELALLOC_RESERVED;
 +      if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL)
 +              ar.flags |= EXT4_MB_USE_RESERVED;
        newblock = ext4_mb_new_blocks(handle, &ar, &err);
        if (!newblock)
                goto out2;
@@@ -4665,7 -4664,6 +4666,7 @@@ static int ext4_alloc_file_blocks(struc
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
 +      int depth = 0;
        struct ext4_map_blocks map;
        unsigned int credits;
        loff_t epos;
        if (len <= EXT_UNWRITTEN_MAX_LEN)
                flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
  
 +      /* Wait all existing dio workers, newcomers will block on i_mutex */
 +      ext4_inode_block_unlocked_dio(inode);
 +      inode_dio_wait(inode);
 +
        /*
         * credits to insert 1 extent into extent tree
         */
        credits = ext4_chunk_trans_blocks(inode, len);
 +      /*
 +       * We can only call ext_depth() on extent based inodes
 +       */
 +      if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              depth = ext_depth(inode);
 +      else
 +              depth = -1;
  
  retry:
        while (ret >= 0 && len) {
 +              /*
 +               * Recalculate credits when extent tree depth changes.
 +               */
 +              if (depth >= 0 && depth != ext_depth(inode)) {
 +                      credits = ext4_chunk_trans_blocks(inode, len);
 +                      depth = ext_depth(inode);
 +              }
 +
                handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
                                            credits);
                if (IS_ERR(handle)) {
                goto retry;
        }
  
 +      ext4_inode_resume_unlocked_dio(inode);
 +
        return ret > 0 ? ret2 : ret;
  }
  
@@@ -4936,14 -4913,12 +4937,14 @@@ long ext4_fallocate(struct file *file, 
         * bug we should fix....
         */
        if (ext4_encrypted_inode(inode) &&
 -          (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)))
 +          (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE |
 +                   FALLOC_FL_ZERO_RANGE)))
                return -EOPNOTSUPP;
  
        /* Return error if mode is not supported */
        if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
 -                   FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 +                   FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
 +                   FALLOC_FL_INSERT_RANGE))
                return -EOPNOTSUPP;
  
        if (mode & FALLOC_FL_PUNCH_HOLE)
        if (mode & FALLOC_FL_COLLAPSE_RANGE)
                return ext4_collapse_range(inode, offset, len);
  
 +      if (mode & FALLOC_FL_INSERT_RANGE)
 +              return ext4_insert_range(inode, offset, len);
 +
        if (mode & FALLOC_FL_ZERO_RANGE)
                return ext4_zero_range(file, offset, len, mode);
  
@@@ -5253,13 -5225,13 +5254,13 @@@ ext4_access_path(handle_t *handle, stru
  /*
   * ext4_ext_shift_path_extents:
   * Shift the extents of a path structure lying between path[depth].p_ext
 - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift
 - * from starting block for each extent.
 + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells
 + * if it is right shift or left shift operation.
   */
  static int
  ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift,
                            struct inode *inode, handle_t *handle,
 -                          ext4_lblk_t *start)
 +                          enum SHIFT_DIRECTION SHIFT)
  {
        int depth, err = 0;
        struct ext4_extent *ex_start, *ex_last;
                        if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr))
                                update = 1;
  
 -                      *start = le32_to_cpu(ex_last->ee_block) +
 -                              ext4_ext_get_actual_len(ex_last);
 -
                        while (ex_start <= ex_last) {
 -                              le32_add_cpu(&ex_start->ee_block, -shift);
 -                              /* Try to merge to the left. */
 -                              if ((ex_start >
 -                                   EXT_FIRST_EXTENT(path[depth].p_hdr)) &&
 -                                  ext4_ext_try_to_merge_right(inode,
 -                                                      path, ex_start - 1))
 +                              if (SHIFT == SHIFT_LEFT) {
 +                                      le32_add_cpu(&ex_start->ee_block,
 +                                              -shift);
 +                                      /* Try to merge to the left. */
 +                                      if ((ex_start >
 +                                          EXT_FIRST_EXTENT(path[depth].p_hdr))
 +                                          &&
 +                                          ext4_ext_try_to_merge_right(inode,
 +                                          path, ex_start - 1))
 +                                              ex_last--;
 +                                      else
 +                                              ex_start++;
 +                              } else {
 +                                      le32_add_cpu(&ex_last->ee_block, shift);
 +                                      ext4_ext_try_to_merge_right(inode, path,
 +                                              ex_last);
                                        ex_last--;
 -                              else
 -                                      ex_start++;
 +                              }
                        }
                        err = ext4_ext_dirty(handle, inode, path + depth);
                        if (err)
                if (err)
                        goto out;
  
 -              le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
 +              if (SHIFT == SHIFT_LEFT)
 +                      le32_add_cpu(&path[depth].p_idx->ei_block, -shift);
 +              else
 +                      le32_add_cpu(&path[depth].p_idx->ei_block, shift);
                err = ext4_ext_dirty(handle, inode, path + depth);
                if (err)
                        goto out;
  
  /*
   * ext4_ext_shift_extents:
 - * All the extents which lies in the range from start to the last allocated
 - * block for the file are shifted downwards by shift blocks.
 + * All the extents which lies in the range from @start to the last allocated
 + * block for the @inode are shifted either towards left or right (depending
 + * upon @SHIFT) by @shift blocks.
   * On success, 0 is returned, error otherwise.
   */
  static int
  ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 -                     ext4_lblk_t start, ext4_lblk_t shift)
 +                     ext4_lblk_t start, ext4_lblk_t shift,
 +                     enum SHIFT_DIRECTION SHIFT)
  {
        struct ext4_ext_path *path;
        int ret = 0, depth;
        struct ext4_extent *extent;
 -      ext4_lblk_t stop_block;
 -      ext4_lblk_t ex_start, ex_end;
 +      ext4_lblk_t stop, *iterator, ex_start, ex_end;
  
        /* Let path point to the last extent */
        path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0);
        if (!extent)
                goto out;
  
 -      stop_block = le32_to_cpu(extent->ee_block) +
 +      stop = le32_to_cpu(extent->ee_block) +
                        ext4_ext_get_actual_len(extent);
  
 -      /* Nothing to shift, if hole is at the end of file */
 -      if (start >= stop_block)
 -              goto out;
 +       /*
 +       * In case of left shift, Don't start shifting extents until we make
 +       * sure the hole is big enough to accommodate the shift.
 +      */
 +      if (SHIFT == SHIFT_LEFT) {
 +              path = ext4_find_extent(inode, start - 1, &path, 0);
 +              if (IS_ERR(path))
 +                      return PTR_ERR(path);
 +              depth = path->p_depth;
 +              extent =  path[depth].p_ext;
 +              if (extent) {
 +                      ex_start = le32_to_cpu(extent->ee_block);
 +                      ex_end = le32_to_cpu(extent->ee_block) +
 +                              ext4_ext_get_actual_len(extent);
 +              } else {
 +                      ex_start = 0;
 +                      ex_end = 0;
 +              }
  
 -      /*
 -       * Don't start shifting extents until we make sure the hole is big
 -       * enough to accomodate the shift.
 -       */
 -      path = ext4_find_extent(inode, start - 1, &path, 0);
 -      if (IS_ERR(path))
 -              return PTR_ERR(path);
 -      depth = path->p_depth;
 -      extent =  path[depth].p_ext;
 -      if (extent) {
 -              ex_start = le32_to_cpu(extent->ee_block);
 -              ex_end = le32_to_cpu(extent->ee_block) +
 -                      ext4_ext_get_actual_len(extent);
 -      } else {
 -              ex_start = 0;
 -              ex_end = 0;
 +              if ((start == ex_start && shift > ex_start) ||
 +                  (shift > start - ex_end)) {
 +                      ext4_ext_drop_refs(path);
 +                      kfree(path);
 +                      return -EINVAL;
 +              }
        }
  
 -      if ((start == ex_start && shift > ex_start) ||
 -          (shift > start - ex_end))
 -              return -EINVAL;
 +      /*
 +       * In case of left shift, iterator points to start and it is increased
 +       * till we reach stop. In case of right shift, iterator points to stop
 +       * and it is decreased till we reach start.
 +       */
 +      if (SHIFT == SHIFT_LEFT)
 +              iterator = &start;
 +      else
 +              iterator = &stop;
  
        /* Its safe to start updating extents */
 -      while (start < stop_block) {
 -              path = ext4_find_extent(inode, start, &path, 0);
 +      while (start < stop) {
 +              path = ext4_find_extent(inode, *iterator, &path, 0);
                if (IS_ERR(path))
                        return PTR_ERR(path);
                depth = path->p_depth;
                extent = path[depth].p_ext;
                if (!extent) {
                        EXT4_ERROR_INODE(inode, "unexpected hole at %lu",
 -                                       (unsigned long) start);
 +                                       (unsigned long) *iterator);
                        return -EIO;
                }
 -              if (start > le32_to_cpu(extent->ee_block)) {
 +              if (SHIFT == SHIFT_LEFT && *iterator >
 +                  le32_to_cpu(extent->ee_block)) {
                        /* Hole, move to the next extent */
                        if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
                                path[depth].p_ext++;
                        } else {
 -                              start = ext4_ext_next_allocated_block(path);
 +                              *iterator = ext4_ext_next_allocated_block(path);
                                continue;
                        }
                }
 +
 +              if (SHIFT == SHIFT_LEFT) {
 +                      extent = EXT_LAST_EXTENT(path[depth].p_hdr);
 +                      *iterator = le32_to_cpu(extent->ee_block) +
 +                                      ext4_ext_get_actual_len(extent);
 +              } else {
 +                      extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
 +                      *iterator =  le32_to_cpu(extent->ee_block) > 0 ?
 +                              le32_to_cpu(extent->ee_block) - 1 : 0;
 +                      /* Update path extent in case we need to stop */
 +                      while (le32_to_cpu(extent->ee_block) < start)
 +                              extent++;
 +                      path[depth].p_ext = extent;
 +              }
                ret = ext4_ext_shift_path_extents(path, shift, inode,
 -                              handle, &start);
 +                              handle, SHIFT);
                if (ret)
                        break;
        }
@@@ -5461,14 -5397,6 +5462,14 @@@ int ext4_collapse_range(struct inode *i
        loff_t new_size, ioffset;
        int ret;
  
 +      /*
 +       * We need to test this early because xfstests assumes that a
 +       * collapse range of (0, 1) will return EOPNOTSUPP if the file
 +       * system does not support collapse range.
 +       */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return -EOPNOTSUPP;
 +
        /* Collapse range works only on fs block size aligned offsets. */
        if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
            len & (EXT4_CLUSTER_SIZE(sb) - 1))
        ext4_discard_preallocations(inode);
  
        ret = ext4_ext_shift_extents(inode, handle, punch_stop,
 -                                   punch_stop - punch_start);
 +                                   punch_stop - punch_start, SHIFT_LEFT);
        if (ret) {
                up_write(&EXT4_I(inode)->i_data_sem);
                goto out_stop;
@@@ -5575,174 -5503,6 +5576,174 @@@ out_mutex
        return ret;
  }
  
 +/*
 + * ext4_insert_range:
 + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate.
 + * The data blocks starting from @offset to the EOF are shifted by @len
 + * towards right to create a hole in the @inode. Inode size is increased
 + * by len bytes.
 + * Returns 0 on success, error otherwise.
 + */
 +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 +{
 +      struct super_block *sb = inode->i_sb;
 +      handle_t *handle;
 +      struct ext4_ext_path *path;
 +      struct ext4_extent *extent;
 +      ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0;
 +      unsigned int credits, ee_len;
 +      int ret = 0, depth, split_flag = 0;
 +      loff_t ioffset;
 +
 +      /*
 +       * We need to test this early because xfstests assumes that an
 +       * insert range of (0, 1) will return EOPNOTSUPP if the file
 +       * system does not support insert range.
 +       */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 +              return -EOPNOTSUPP;
 +
 +      /* Insert range works only on fs block size aligned offsets. */
 +      if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) ||
 +                      len & (EXT4_CLUSTER_SIZE(sb) - 1))
 +              return -EINVAL;
 +
 +      if (!S_ISREG(inode->i_mode))
 +              return -EOPNOTSUPP;
 +
 +      trace_ext4_insert_range(inode, offset, len);
 +
 +      offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb);
 +      len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb);
 +
 +      /* Call ext4_force_commit to flush all data in case of data=journal */
 +      if (ext4_should_journal_data(inode)) {
 +              ret = ext4_force_commit(inode->i_sb);
 +              if (ret)
 +                      return ret;
 +      }
 +
 +      /*
 +       * Need to round down to align start offset to page size boundary
 +       * for page size > block size.
 +       */
 +      ioffset = round_down(offset, PAGE_SIZE);
 +
 +      /* Write out all dirty pages */
 +      ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
 +                      LLONG_MAX);
 +      if (ret)
 +              return ret;
 +
 +      /* Take mutex lock */
 +      mutex_lock(&inode->i_mutex);
 +
 +      /* Currently just for extent based files */
 +      if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 +              ret = -EOPNOTSUPP;
 +              goto out_mutex;
 +      }
 +
 +      /* Check for wrap through zero */
 +      if (inode->i_size + len > inode->i_sb->s_maxbytes) {
 +              ret = -EFBIG;
 +              goto out_mutex;
 +      }
 +
 +      /* Offset should be less than i_size */
 +      if (offset >= i_size_read(inode)) {
 +              ret = -EINVAL;
 +              goto out_mutex;
 +      }
 +
 +      truncate_pagecache(inode, ioffset);
 +
 +      /* Wait for existing dio to complete */
 +      ext4_inode_block_unlocked_dio(inode);
 +      inode_dio_wait(inode);
 +
 +      credits = ext4_writepage_trans_blocks(inode);
 +      handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 +      if (IS_ERR(handle)) {
 +              ret = PTR_ERR(handle);
 +              goto out_dio;
 +      }
 +
 +      /* Expand the file to avoid data loss if there is an error while shifting */
 +      inode->i_size += len;
 +      EXT4_I(inode)->i_disksize += len;
 +      inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 +      ret = ext4_mark_inode_dirty(handle, inode);
 +      if (ret)
 +              goto out_stop;
 +
 +      down_write(&EXT4_I(inode)->i_data_sem);
 +      ext4_discard_preallocations(inode);
 +
 +      path = ext4_find_extent(inode, offset_lblk, NULL, 0);
 +      if (IS_ERR(path)) {
 +              up_write(&EXT4_I(inode)->i_data_sem);
 +              goto out_stop;
 +      }
 +
 +      depth = ext_depth(inode);
 +      extent = path[depth].p_ext;
 +      if (extent) {
 +              ee_start_lblk = le32_to_cpu(extent->ee_block);
 +              ee_len = ext4_ext_get_actual_len(extent);
 +
 +              /*
 +               * If offset_lblk is not the starting block of the extent,
 +               * split the extent at @offset_lblk
 +               */
 +              if ((offset_lblk > ee_start_lblk) &&
 +                              (offset_lblk < (ee_start_lblk + ee_len))) {
 +                      if (ext4_ext_is_unwritten(extent))
 +                              split_flag = EXT4_EXT_MARK_UNWRIT1 |
 +                                      EXT4_EXT_MARK_UNWRIT2;
 +                      ret = ext4_split_extent_at(handle, inode, &path,
 +                                      offset_lblk, split_flag,
 +                                      EXT4_EX_NOCACHE |
 +                                      EXT4_GET_BLOCKS_PRE_IO |
 +                                      EXT4_GET_BLOCKS_METADATA_NOFAIL);
 +              }
 +
 +              ext4_ext_drop_refs(path);
 +              kfree(path);
 +              if (ret < 0) {
 +                      up_write(&EXT4_I(inode)->i_data_sem);
 +                      goto out_stop;
 +              }
 +      }
 +
 +      ret = ext4_es_remove_extent(inode, offset_lblk,
 +                      EXT_MAX_BLOCKS - offset_lblk);
 +      if (ret) {
 +              up_write(&EXT4_I(inode)->i_data_sem);
 +              goto out_stop;
 +      }
 +
 +      /*
 +       * If offset_lblk lies in a hole at the start of the file, use
 +       * ee_start_lblk to shift the extents
 +       */
 +      ret = ext4_ext_shift_extents(inode, handle,
 +              ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk,
 +              len_lblk, SHIFT_RIGHT);
 +
 +      up_write(&EXT4_I(inode)->i_data_sem);
 +      if (IS_SYNC(inode))
 +              ext4_handle_sync(handle);
 +
 +out_stop:
 +      ext4_journal_stop(handle);
 +out_dio:
 +      ext4_inode_resume_unlocked_dio(inode);
 +out_mutex:
 +      mutex_unlock(&inode->i_mutex);
 +      return ret;
 +}
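
For context, ext4_insert_range() above backs the new FALLOC_FL_INSERT_RANGE mode of
fallocate(2). A minimal userspace sketch of how it would be exercised follows; the
helper name is illustrative, and the only hard requirement assumed is that offset and
len be multiples of the filesystem block size, as checked by the kernel side above.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

/* Sketch: open a hole in an extent-based file by shifting everything
 * from 'offset' onwards to the right by 'len'. */
int insert_hole(const char *path, off_t offset, off_t len)
{
	int fd = open(path, O_RDWR);
	int ret = 0;

	if (fd < 0)
		return -1;
	/* offset and len must both be multiples of the fs block size */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, offset, len) < 0) {
		perror("fallocate");
		ret = -1;
	}
	close(fd);
	return ret;
}
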
 +
  /**
   * ext4_swap_extents - Swap extents between two inodes
   *
@@@ -5775,7 -5535,7 +5776,7 @@@ ext4_swap_extents(handle_t *handle, str
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
        BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
        BUG_ON(!mutex_is_locked(&inode1->i_mutex));
 -      BUG_ON(!mutex_is_locked(&inode1->i_mutex));
 +      BUG_ON(!mutex_is_locked(&inode2->i_mutex));
  
        *erp = ext4_es_remove_extent(inode1, lblk1, count);
        if (unlikely(*erp))
diff --combined fs/ext4/mballoc.c
index 1c535fa67640da69def57f0e88f5c8d5e233c0bc,440987c8ba9ef2dc27eee2092e244d6f4c32bb7b..f6aedf88da437ee324c314bb1020baa51db0423c
@@@ -26,6 -26,7 +26,7 @@@
  #include <linux/log2.h>
  #include <linux/module.h>
  #include <linux/slab.h>
+ #include <linux/backing-dev.h>
  #include <trace/events/ext4.h>
  
  #ifdef CONFIG_EXT4_DEBUG
@@@ -882,8 -883,10 +883,8 @@@ static int ext4_mb_init_cache(struct pa
  
        /* wait for I/O completion */
        for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
 -              if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
 +              if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i]))
                        err = -EIO;
 -                      goto out;
 -              }
        }
  
        first_block = page->index * blocks_per_page;
                        /* skip initialized uptodate buddy */
                        continue;
  
 +              if (!buffer_verified(bh[group - first_group]))
 +                      /* Skip faulty bitmaps */
 +                      continue;
 +              err = 0;
 +
                /*
                 * data carry information regarding this
                 * particular group in the format specified
@@@ -2011,12 -2009,7 +2012,12 @@@ void ext4_mb_scan_aligned(struct ext4_a
        }
  }
  
 -/* This is now called BEFORE we load the buddy bitmap. */
 +/*
 + * This is now called BEFORE we load the buddy bitmap.
 + * Returns 1 if the group is suitable for the allocation and 0 if it is
 + * not.  In addition, it can return a negative error code when something
 + * goes wrong.
 + */
  static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                                ext4_group_t group, int cr)
  {
        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
                int ret = ext4_mb_init_group(ac->ac_sb, group);
                if (ret)
 -                      return 0;
 +                      return ret;
        }
  
        fragments = grp->bb_fragments;
@@@ -2086,7 -2079,7 +2087,7 @@@ ext4_mb_regular_allocator(struct ext4_a
  {
        ext4_group_t ngroups, group, i;
        int cr;
 -      int err = 0;
 +      int err = 0, first_err = 0;
        struct ext4_sb_info *sbi;
        struct super_block *sb;
        struct ext4_buddy e4b;
@@@ -2153,7 -2146,6 +2154,7 @@@ repeat
                group = ac->ac_g_ex.fe_group;
  
                for (i = 0; i < ngroups; group++, i++) {
 +                      int ret = 0;
                        cond_resched();
                        /*
                         * Artificially restricted ngroups for non-extent
                                group = 0;
  
                        /* This now checks without needing the buddy page */
 -                      if (!ext4_mb_good_group(ac, group, cr))
 +                      ret = ext4_mb_good_group(ac, group, cr);
 +                      if (ret <= 0) {
 +                              if (!first_err)
 +                                      first_err = ret;
                                continue;
 +                      }
  
                        err = ext4_mb_load_buddy(sb, group, &e4b);
                        if (err)
                         * We need to check again after locking the
                         * block group
                         */
 -                      if (!ext4_mb_good_group(ac, group, cr)) {
 +                      ret = ext4_mb_good_group(ac, group, cr);
 +                      if (ret <= 0) {
                                ext4_unlock_group(sb, group);
                                ext4_mb_unload_buddy(&e4b);
 +                              if (!first_err)
 +                                      first_err = ret;
                                continue;
                        }
  
                }
        }
  out:
 +      if (!err && ac->ac_status != AC_STATUS_FOUND && first_err)
 +              err = first_err;
        return err;
  }
  
@@@ -2275,9 -2258,12 +2276,9 @@@ static int ext4_mb_seq_groups_show(stru
  
        group--;
        if (group == 0)
 -              seq_printf(seq, "#%-5s: %-5s %-5s %-5s "
 -                              "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s "
 -                                "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n",
 -                         "group", "free", "frags", "first",
 -                         "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6",
 -                         "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13");
 +              seq_puts(seq, "#group: free  frags first ["
 +                            " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
 +                            " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
  
        i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
                sizeof(struct ext4_group_info);
diff --combined fs/ext4/super.c
index 90ec13fe8ac73e5d81cda8ce4d267afbf2ff571a,56b8bb75c3fc3b8809bf310dde5df49139caeb35..a7b4b6e1026920823a149b2f124371cba092e387
@@@ -24,6 -24,7 +24,7 @@@
  #include <linux/slab.h>
  #include <linux/init.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  #include <linux/parser.h>
  #include <linux/buffer_head.h>
  #include <linux/exportfs.h>
@@@ -294,8 -295,6 +295,8 @@@ static void __save_error_info(struct su
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
  
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 +      if (bdev_read_only(sb->s_bdev))
 +              return;
        es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
        es->s_last_error_time = cpu_to_le32(get_seconds());
        strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
@@@ -591,17 -590,14 +592,17 @@@ void __ext4_msg(struct super_block *sb
        va_end(args);
  }
  
 +#define ext4_warning_ratelimit(sb)                                    \
 +              ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \
 +                           "EXT4-fs warning")
 +
  void __ext4_warning(struct super_block *sb, const char *function,
                    unsigned int line, const char *fmt, ...)
  {
        struct va_format vaf;
        va_list args;
  
 -      if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state),
 -                        "EXT4-fs warning"))
 +      if (!ext4_warning_ratelimit(sb))
                return;
  
        va_start(args, fmt);
        va_end(args);
  }
  
 +void __ext4_warning_inode(const struct inode *inode, const char *function,
 +                        unsigned int line, const char *fmt, ...)
 +{
 +      struct va_format vaf;
 +      va_list args;
 +
 +      if (!ext4_warning_ratelimit(inode->i_sb))
 +              return;
 +
 +      va_start(args, fmt);
 +      vaf.fmt = fmt;
 +      vaf.va = &args;
 +      printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: "
 +             "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id,
 +             function, line, inode->i_ino, current->comm, &vaf);
 +      va_end(args);
 +}
 +
  void __ext4_grp_locked_error(const char *function, unsigned int line,
                             struct super_block *sb, ext4_group_t grp,
                             unsigned long ino, ext4_fsblk_t block,
@@@ -828,7 -806,6 +829,7 @@@ static void ext4_put_super(struct super
                dump_orphan_list(sb, sbi);
        J_ASSERT(list_empty(&sbi->s_orphan));
  
 +      sync_blockdev(sb->s_bdev);
        invalidate_bdev(sb->s_bdev);
        if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
                /*
@@@ -901,8 -878,9 +902,8 @@@ static struct inode *ext4_alloc_inode(s
        atomic_set(&ei->i_unwritten, 0);
        INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
  #ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID;
 +      ei->i_crypt_info = NULL;
  #endif
 -
        return &ei->vfs_inode;
  }
  
@@@ -979,10 -957,6 +980,10 @@@ void ext4_clear_inode(struct inode *ino
                jbd2_free_inode(EXT4_I(inode)->jinode);
                EXT4_I(inode)->jinode = NULL;
        }
 +#ifdef CONFIG_EXT4_FS_ENCRYPTION
 +      if (EXT4_I(inode)->i_crypt_info)
 +              ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info);
 +#endif
  }
  
  static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@@ -3474,6 -3448,11 +3475,6 @@@ static int ext4_fill_super(struct super
        if (sb->s_bdev->bd_part)
                sbi->s_sectors_written_start =
                        part_stat_read(sb->s_bdev->bd_part, sectors[1]);
 -#ifdef CONFIG_EXT4_FS_ENCRYPTION
 -      /* Modes of operations for file and directory encryption. */
 -      sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS;
 -      sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID;
 -#endif
  
        /* Cleanup superblock name */
        for (cp = sb->s_id; (cp = strchr(cp, '/'));)
@@@ -4087,15 -4066,7 +4088,15 @@@ no_journal
                }
        }
  
 -      if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) &&
 +      if ((DUMMY_ENCRYPTION_ENABLED(sbi) ||
 +           EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) &&
 +          (blocksize != PAGE_CACHE_SIZE)) {
 +              ext4_msg(sb, KERN_ERR,
 +                       "Unsupported blocksize for fs encryption");
 +              goto failed_mount_wq;
 +      }
 +
 +      if (DUMMY_ENCRYPTION_ENABLED(sbi) &&
            !(sb->s_flags & MS_RDONLY) &&
            !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) {
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT);
@@@ -4971,9 -4942,6 +4972,9 @@@ static int ext4_remount(struct super_bl
                set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
        }
  
 +      if (*flags & MS_LAZYTIME)
 +              sb->s_flags |= MS_LAZYTIME;
 +
        if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
                if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
                        err = -EROFS;
@@@ -5441,7 -5409,6 +5442,7 @@@ static ssize_t ext4_quota_write(struct 
        struct inode *inode = sb_dqopt(sb)->files[type];
        ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb);
        int err, offset = off & (sb->s_blocksize - 1);
 +      int retries = 0;
        struct buffer_head *bh;
        handle_t *handle = journal_current_handle();
  
                return -EIO;
        }
  
 -      bh = ext4_bread(handle, inode, blk, 1);
 +      do {
 +              bh = ext4_bread(handle, inode, blk,
 +                              EXT4_GET_BLOCKS_CREATE |
 +                              EXT4_GET_BLOCKS_METADATA_NOFAIL);
 +      } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) &&
 +               ext4_should_retry_alloc(inode->i_sb, &retries));
        if (IS_ERR(bh))
                return PTR_ERR(bh);
        if (!bh)
@@@ -5684,7 -5646,6 +5685,7 @@@ out7
  
  static void __exit ext4_exit_fs(void)
  {
 +      ext4_exit_crypto();
        ext4_destroy_lazyinit_thread();
        unregister_as_ext2();
        unregister_as_ext3();
diff --combined fs/f2fs/node.c
index d9c52424bac21555f5a84a2f8ef9f7f2dc7751c5,d211602e0f86f7396553cf422439a94e118a22cc..7dd63b794bfb5a04ae0d203c9ed2c8e739f0eb08
@@@ -53,7 -53,7 +53,7 @@@ bool available_free_memory(struct f2fs_
                                                        PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
        } else if (type == DIRTY_DENTS) {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
                mem_size = get_pages(sbi, F2FS_DIRTY_DENTS);
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
@@@ -70,7 -70,7 +70,7 @@@
                                sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
        } else {
-               if (sbi->sb->s_bdi->dirty_exceeded)
+               if (sbi->sb->s_bdi->wb.dirty_exceeded)
                        return false;
        }
        return res;
@@@ -195,35 -195,32 +195,35 @@@ static unsigned int __gang_lookup_nat_s
                                                        start, nr);
  }
  
 -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
 +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid)
  {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
 -      bool is_cp = true;
 +      bool need = false;
  
        down_read(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
 -      if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 -              is_cp = false;
 +      if (e) {
 +              if (!get_nat_flag(e, IS_CHECKPOINTED) &&
 +                              !get_nat_flag(e, HAS_FSYNCED_INODE))
 +                      need = true;
 +      }
        up_read(&nm_i->nat_tree_lock);
 -      return is_cp;
 +      return need;
  }
  
 -bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino)
 +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid)
  {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
 -      bool fsynced = false;
 +      bool is_cp = true;
  
        down_read(&nm_i->nat_tree_lock);
 -      e = __lookup_nat_cache(nm_i, ino);
 -      if (e && get_nat_flag(e, HAS_FSYNCED_INODE))
 -              fsynced = true;
 +      e = __lookup_nat_cache(nm_i, nid);
 +      if (e && !get_nat_flag(e, IS_CHECKPOINTED))
 +              is_cp = false;
        up_read(&nm_i->nat_tree_lock);
 -      return fsynced;
 +      return is_cp;
  }
  
  bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
@@@ -315,8 -312,7 +315,8 @@@ static void set_node_addr(struct f2fs_s
        __set_nat_cache_dirty(nm_i, e);
  
        /* update fsync_mark if its inode nat entry is still alive */
 -      e = __lookup_nat_cache(nm_i, ni->ino);
 +      if (ni->nid != ni->ino)
 +              e = __lookup_nat_cache(nm_i, ni->ino);
        if (e) {
                if (fsync_done && ni->nid == ni->ino)
                        set_nat_flag(e, HAS_FSYNCED_INODE, true);
@@@ -999,11 -995,8 +999,11 @@@ static int read_node_page(struct page *
        struct f2fs_sb_info *sbi = F2FS_P_SB(page);
        struct node_info ni;
        struct f2fs_io_info fio = {
 +              .sbi = sbi,
                .type = NODE,
                .rw = rw,
 +              .page = page,
 +              .encrypted_page = NULL,
        };
  
        get_node_info(sbi, page->index, &ni);
                return LOCKED_PAGE;
  
        fio.blk_addr = ni.blk_addr;
 -      return f2fs_submit_page_bio(sbi, page, &fio);
 +      return f2fs_submit_page_bio(&fio);
  }
  
  /*
@@@ -1211,9 -1204,13 +1211,9 @@@ continue_unlock
                        /* called by fsync() */
                        if (ino && IS_DNODE(page)) {
                                set_fsync_mark(page, 1);
 -                              if (IS_INODE(page)) {
 -                                      if (!is_checkpointed_node(sbi, ino) &&
 -                                              !has_fsynced_inode(sbi, ino))
 -                                              set_dentry_mark(page, 1);
 -                                      else
 -                                              set_dentry_mark(page, 0);
 -                              }
 +                              if (IS_INODE(page))
 +                                      set_dentry_mark(page,
 +                                              need_dentry_mark(sbi, ino));
                                nwritten++;
                        } else {
                                set_fsync_mark(page, 0);
@@@ -1296,11 -1293,8 +1296,11 @@@ static int f2fs_write_node_page(struct 
        nid_t nid;
        struct node_info ni;
        struct f2fs_io_info fio = {
 +              .sbi = sbi,
                .type = NODE,
                .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE,
 +              .page = page,
 +              .encrypted_page = NULL,
        };
  
        trace_f2fs_writepage(page, NODE);
  
        set_page_writeback(page);
        fio.blk_addr = ni.blk_addr;
 -      write_node_page(sbi, page, nid, &fio);
 +      write_node_page(nid, &fio);
        set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
        dec_page_count(sbi, F2FS_DIRTY_NODES);
        up_read(&sbi->node_write);
diff --combined fs/f2fs/segment.h
index 8496357781188188126c1de28afc55f347d10198,aba72f7a8ac4b45e8b05111cd15f42bb1b26e0cd..79e7b879a75321047bf00fd22d6b55d37a270af7
@@@ -9,6 -9,7 +9,7 @@@
   * published by the Free Software Foundation.
   */
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  
  /* constant macro */
  #define NULL_SEGNO                    ((unsigned int)(~0))
@@@ -163,7 -164,6 +164,7 @@@ struct seg_entry 
         */
        unsigned short ckpt_valid_blocks;
        unsigned char *ckpt_valid_map;
 +      unsigned char *discard_map;
        unsigned char type;             /* segment type like CURSEG_XXX_TYPE */
        unsigned long long mtime;       /* modification time of the segment */
  };
@@@ -714,7 -714,7 +715,7 @@@ static inline unsigned int max_hw_block
   */
  static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type)
  {
-       if (sbi->sb->s_bdi->dirty_exceeded)
+       if (sbi->sb->s_bdi->wb.dirty_exceeded)
                return 0;
  
        if (type == DATA)
diff --combined fs/inode.c
index e8d62688ed9181e511e2a0e8c6a5f36840cdbe94,efc9edacfb9b4a5da04231e7692b16d59b44736e..069721f0cc0e0b733bb659fb0d7836cd71499690
@@@ -152,7 -152,6 +152,7 @@@ int inode_init_always(struct super_bloc
        inode->i_pipe = NULL;
        inode->i_bdev = NULL;
        inode->i_cdev = NULL;
 +      inode->i_link = NULL;
        inode->i_rdev = 0;
        inode->dirtied_when = 0;
  
@@@ -224,6 -223,7 +224,7 @@@ EXPORT_SYMBOL(free_inode_nonrcu)
  void __destroy_inode(struct inode *inode)
  {
        BUG_ON(inode_has_buffers(inode));
+       inode_detach_wb(inode);
        security_inode_free(inode);
        fsnotify_inode_delete(inode);
        locks_free_lock_context(inode->i_flctx);
@@@ -1585,47 -1585,36 +1586,47 @@@ static int update_time(struct inode *in
   *    This function automatically handles read only file systems and media,
   *    as well as the "noatime" flag and inode specific "noatime" markers.
   */
 -void touch_atime(const struct path *path)
 +bool atime_needs_update(const struct path *path, struct inode *inode)
  {
        struct vfsmount *mnt = path->mnt;
 -      struct inode *inode = d_inode(path->dentry);
        struct timespec now;
  
        if (inode->i_flags & S_NOATIME)
 -              return;
 +              return false;
        if (IS_NOATIME(inode))
 -              return;
 +              return false;
        if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
 -              return;
 +              return false;
  
        if (mnt->mnt_flags & MNT_NOATIME)
 -              return;
 +              return false;
        if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
 -              return;
 +              return false;
  
        now = current_fs_time(inode->i_sb);
  
        if (!relatime_need_update(mnt, inode, now))
 -              return;
 +              return false;
  
        if (timespec_equal(&inode->i_atime, &now))
 +              return false;
 +
 +      return true;
 +}
 +
 +void touch_atime(const struct path *path)
 +{
 +      struct vfsmount *mnt = path->mnt;
 +      struct inode *inode = d_inode(path->dentry);
 +      struct timespec now;
 +
 +      if (!atime_needs_update(path, inode))
                return;
  
        if (!sb_start_write_trylock(inode->i_sb))
                return;
  
 -      if (__mnt_want_write(mnt))
 +      if (__mnt_want_write(mnt) != 0)
                goto skip_update;
        /*
         * File systems can error out when updating inodes if they need to
         * We may also fail on filesystems that have the ability to make parts
         * of the fs read only, e.g. subvolumes in Btrfs.
         */
 +      now = current_fs_time(inode->i_sb);
        update_time(inode, &now, S_ATIME);
        __mnt_drop_write(mnt);
  skip_update:
diff --combined fs/nfs/write.c
index dfc19f1575a19d00bee1b0aeef6575e4416a9ef9,94c7ce01dfb1b27403ca706ab73fe48445163570..e6c262555e08a62aff65ef3baa04e9666e9f18c2
@@@ -853,7 -853,8 +853,8 @@@ static voi
  nfs_clear_page_commit(struct page *page)
  {
        dec_zone_page_state(page, NR_UNSTABLE_NFS);
-       dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
+       dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb,
+                   WB_RECLAIMABLE);
  }
  
  /* Called holding inode (/cinfo) lock */
@@@ -1845,15 -1846,12 +1846,15 @@@ int nfs_wb_all(struct inode *inode
        trace_nfs_writeback_inode_enter(inode);
  
        ret = filemap_write_and_wait(inode->i_mapping);
 -      if (!ret) {
 -              ret = nfs_commit_inode(inode, FLUSH_SYNC);
 -              if (!ret)
 -                      pnfs_sync_inode(inode, true);
 -      }
 +      if (ret)
 +              goto out;
 +      ret = nfs_commit_inode(inode, FLUSH_SYNC);
 +      if (ret < 0)
 +              goto out;
 +      pnfs_sync_inode(inode, true);
 +      ret = 0;
  
 +out:
        trace_nfs_writeback_inode_exit(inode, ret);
        return ret;
  }
diff --combined fs/ocfs2/file.c
index fbfadb289e628ce32decb024bab92792e4ebc995,8f1feca89fb08fda21d050e3fcc49e1bb4c88398..719f7f4c7a37bd8cfb292fed77756baed9bacb0b
@@@ -37,6 -37,7 +37,7 @@@
  #include <linux/falloc.h>
  #include <linux/quotaops.h>
  #include <linux/blkdev.h>
+ #include <linux/backing-dev.h>
  
  #include <cluster/masklog.h>
  
@@@ -2250,7 -2251,7 +2251,7 @@@ out
  static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
                                    struct iov_iter *from)
  {
 -      int direct_io, appending, rw_level, have_alloc_sem  = 0;
 +      int direct_io, appending, rw_level;
        int can_do_direct, has_refcount = 0;
        ssize_t written = 0;
        ssize_t ret;
  
        mutex_lock(&inode->i_mutex);
  
 -      ocfs2_iocb_clear_sem_locked(iocb);
 -
  relock:
 -      /* to match setattr's i_mutex -> rw_lock ordering */
 -      if (direct_io) {
 -              have_alloc_sem = 1;
 -              /* communicate with ocfs2_dio_end_io */
 -              ocfs2_iocb_set_sem_locked(iocb);
 -      }
 -
        /*
         * Concurrent O_DIRECT writes are allowed with
         * mount_option "coherency=buffered".
        ret = ocfs2_rw_lock(inode, rw_level);
        if (ret < 0) {
                mlog_errno(ret);
 -              goto out_sems;
 +              goto out_mutex;
        }
  
        /*
        if (direct_io && !can_do_direct) {
                ocfs2_rw_unlock(inode, rw_level);
  
 -              have_alloc_sem = 0;
                rw_level = -1;
  
                direct_io = 0;
@@@ -2406,6 -2417,7 +2407,6 @@@ no_sync
         */
        if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
                rw_level = -1;
 -              have_alloc_sem = 0;
                unaligned_dio = 0;
        }
  
@@@ -2418,7 -2430,10 +2419,7 @@@ out
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
 -out_sems:
 -      if (have_alloc_sem)
 -              ocfs2_iocb_clear_sem_locked(iocb);
 -
 +out_mutex:
        mutex_unlock(&inode->i_mutex);
  
        if (written)
@@@ -2459,7 -2474,7 +2460,7 @@@ bail
  static ssize_t ocfs2_file_read_iter(struct kiocb *iocb,
                                   struct iov_iter *to)
  {
 -      int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0;
 +      int ret = 0, rw_level = -1, lock_level = 0;
        struct file *filp = iocb->ki_filp;
        struct inode *inode = file_inode(filp);
  
                goto bail;
        }
  
 -      ocfs2_iocb_clear_sem_locked(iocb);
 -
        /*
         * buffered reads protect themselves in ->readpage().  O_DIRECT reads
         * need locks to protect pending reads from racing with truncate.
         */
        if (iocb->ki_flags & IOCB_DIRECT) {
 -              have_alloc_sem = 1;
 -              ocfs2_iocb_set_sem_locked(iocb);
 -
                ret = ocfs2_rw_lock(inode, 0);
                if (ret < 0) {
                        mlog_errno(ret);
        /* see ocfs2_file_write_iter */
        if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) {
                rw_level = -1;
 -              have_alloc_sem = 0;
        }
  
  bail:
 -      if (have_alloc_sem)
 -              ocfs2_iocb_clear_sem_locked(iocb);
 -
        if (rw_level != -1)
                ocfs2_rw_unlock(inode, rw_level);
  
diff --combined fs/xfs/xfs_file.c
index 3b7591224f4a6698d32371a927e70cb2a391f4a9,4e00b38efbe0925f834999401e286682c630dd72..7c62fca53e2fc36b5c61f37a829e6532b7e210d2
@@@ -41,6 -41,7 +41,7 @@@
  #include <linux/dcache.h>
  #include <linux/falloc.h>
  #include <linux/pagevec.h>
+ #include <linux/backing-dev.h>
  
  static const struct vm_operations_struct xfs_file_vm_ops;
  
@@@ -124,7 -125,7 +125,7 @@@ xfs_iozero
                status = 0;
        } while (count);
  
 -      return (-status);
 +      return status;
  }
  
  int
diff --combined include/linux/backing-dev.h
index d87d8eced06407c59c6d231f9e707bdcc398ce52,a13181a42b9aee87a652d1a94a7e322dc6aaaf6e..0e6d4828a77a358edd3c77ef7d14eecc6f6001b3
  #ifndef _LINUX_BACKING_DEV_H
  #define _LINUX_BACKING_DEV_H
  
- #include <linux/percpu_counter.h>
- #include <linux/log2.h>
- #include <linux/flex_proportions.h>
  #include <linux/kernel.h>
  #include <linux/fs.h>
  #include <linux/sched.h>
- #include <linux/timer.h>
+ #include <linux/blkdev.h>
  #include <linux/writeback.h>
- #include <linux/atomic.h>
- #include <linux/sysctl.h>
- #include <linux/workqueue.h>
- struct page;
- struct device;
- struct dentry;
- /*
-  * Bits in backing_dev_info.state
-  */
- enum bdi_state {
-       BDI_async_congested,    /* The async (write) queue is getting full */
-       BDI_sync_congested,     /* The sync queue is getting full */
-       BDI_registered,         /* bdi_register() was done */
-       BDI_writeback_running,  /* Writeback is in progress */
- };
- typedef int (congested_fn)(void *, int);
- enum bdi_stat_item {
-       BDI_RECLAIMABLE,
-       BDI_WRITEBACK,
-       BDI_DIRTIED,
-       BDI_WRITTEN,
-       NR_BDI_STAT_ITEMS
- };
- #define BDI_STAT_BATCH (8*(1+ilog2(nr_cpu_ids)))
- struct bdi_writeback {
-       struct backing_dev_info *bdi;   /* our parent bdi */
-       unsigned long last_old_flush;   /* last old data flush */
-       struct delayed_work dwork;      /* work item used for writeback */
-       struct list_head b_dirty;       /* dirty inodes */
-       struct list_head b_io;          /* parked for writeback */
-       struct list_head b_more_io;     /* parked for more writeback */
-       struct list_head b_dirty_time;  /* time stamps are dirty */
-       spinlock_t list_lock;           /* protects the b_* lists */
- };
- struct backing_dev_info {
-       struct list_head bdi_list;
-       unsigned long ra_pages; /* max readahead in PAGE_CACHE_SIZE units */
-       unsigned long state;    /* Always use atomic bitops on this */
-       unsigned int capabilities; /* Device capabilities */
-       congested_fn *congested_fn; /* Function pointer if device is md/dm */
-       void *congested_data;   /* Pointer to aux data for congested func */
-       char *name;
-       struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
-       unsigned long bw_time_stamp;    /* last time write bw is updated */
-       unsigned long dirtied_stamp;
-       unsigned long written_stamp;    /* pages written at bw_time_stamp */
-       unsigned long write_bandwidth;  /* the estimated write bandwidth */
-       unsigned long avg_write_bandwidth; /* further smoothed write bw */
-       /*
-        * The base dirty throttle rate, re-calculated on every 200ms.
-        * All the bdi tasks' dirty rate will be curbed under it.
-        * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
-        * in small steps and is much more smooth/stable than the latter.
-        */
-       unsigned long dirty_ratelimit;
-       unsigned long balanced_dirty_ratelimit;
-       struct fprop_local_percpu completions;
-       int dirty_exceeded;
-       unsigned int min_ratio;
-       unsigned int max_ratio, max_prop_frac;
-       struct bdi_writeback wb;  /* default writeback info for this bdi */
-       spinlock_t wb_lock;       /* protects work_list & wb.dwork scheduling */
-       struct list_head work_list;
-       struct device *dev;
-       struct timer_list laptop_mode_wb_timer;
- #ifdef CONFIG_DEBUG_FS
-       struct dentry *debug_dir;
-       struct dentry *debug_stats;
- #endif
- };
- struct backing_dev_info *inode_to_bdi(struct inode *inode);
+ #include <linux/blk-cgroup.h>
+ #include <linux/backing-dev-defs.h>
  
  int __must_check bdi_init(struct backing_dev_info *bdi);
  void bdi_destroy(struct backing_dev_info *bdi);
@@@ -116,98 -23,101 +23,100 @@@ __printf(3, 4
  int bdi_register(struct backing_dev_info *bdi, struct device *parent,
                const char *fmt, ...);
  int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev);
 -void bdi_unregister(struct backing_dev_info *bdi);
  int __must_check bdi_setup_and_register(struct backing_dev_info *, char *);
- void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-                       enum wb_reason reason);
- void bdi_start_background_writeback(struct backing_dev_info *bdi);
- void bdi_writeback_workfn(struct work_struct *work);
- int bdi_has_dirty_io(struct backing_dev_info *bdi);
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+ void wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
+                       bool range_cyclic, enum wb_reason reason);
+ void wb_start_background_writeback(struct bdi_writeback *wb);
+ void wb_workfn(struct work_struct *work);
+ void wb_wakeup_delayed(struct bdi_writeback *wb);
  
  extern spinlock_t bdi_lock;
  extern struct list_head bdi_list;
  
  extern struct workqueue_struct *bdi_wq;
  
- static inline int wb_has_dirty_io(struct bdi_writeback *wb)
+ static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
  {
-       return !list_empty(&wb->b_dirty) ||
-              !list_empty(&wb->b_io) ||
-              !list_empty(&wb->b_more_io);
+       return test_bit(WB_has_dirty_io, &wb->state);
+ }
+ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
+ {
+       /*
+        * @bdi->tot_write_bandwidth is guaranteed to be > 0 if there are
+        * any dirty wbs.  See wb_update_write_bandwidth().
+        */
+       return atomic_long_read(&bdi->tot_write_bandwidth);
  }
  
- static inline void __add_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item, s64 amount)
+ static inline void __add_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item, s64 amount)
  {
-       __percpu_counter_add(&bdi->bdi_stat[item], amount, BDI_STAT_BATCH);
+       __percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
  }
  
- static inline void __inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void __inc_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
  {
-       __add_bdi_stat(bdi, item, 1);
+       __add_wb_stat(wb, item, 1);
  }
  
- static inline void inc_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void inc_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        unsigned long flags;
  
        local_irq_save(flags);
-       __inc_bdi_stat(bdi, item);
+       __inc_wb_stat(wb, item);
        local_irq_restore(flags);
  }
  
- static inline void __dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void __dec_wb_stat(struct bdi_writeback *wb,
+                                enum wb_stat_item item)
  {
-       __add_bdi_stat(bdi, item, -1);
+       __add_wb_stat(wb, item, -1);
  }
  
- static inline void dec_bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline void dec_wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        unsigned long flags;
  
        local_irq_save(flags);
-       __dec_bdi_stat(bdi, item);
+       __dec_wb_stat(wb, item);
        local_irq_restore(flags);
  }
  
- static inline s64 bdi_stat(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 wb_stat(struct bdi_writeback *wb, enum wb_stat_item item)
  {
-       return percpu_counter_read_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_read_positive(&wb->stat[item]);
  }
  
- static inline s64 __bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 __wb_stat_sum(struct bdi_writeback *wb,
+                               enum wb_stat_item item)
  {
-       return percpu_counter_sum_positive(&bdi->bdi_stat[item]);
+       return percpu_counter_sum_positive(&wb->stat[item]);
  }
  
- static inline s64 bdi_stat_sum(struct backing_dev_info *bdi,
-               enum bdi_stat_item item)
+ static inline s64 wb_stat_sum(struct bdi_writeback *wb, enum wb_stat_item item)
  {
        s64 sum;
        unsigned long flags;
  
        local_irq_save(flags);
-       sum = __bdi_stat_sum(bdi, item);
+       sum = __wb_stat_sum(wb, item);
        local_irq_restore(flags);
  
        return sum;
  }
  
- extern void bdi_writeout_inc(struct backing_dev_info *bdi);
+ extern void wb_writeout_inc(struct bdi_writeback *wb);
  
  /*
   * maximal error of a stat counter.
   */
- static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
+ static inline unsigned long wb_stat_error(struct bdi_writeback *wb)
  {
  #ifdef CONFIG_SMP
-       return nr_cpu_ids * BDI_STAT_BATCH;
+       return nr_cpu_ids * WB_STAT_BATCH;
  #else
        return 1;
  #endif
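
The wb stat helpers above mirror the old per-bdi ones: wb_stat() is a cheap approximate
read, wb_stat_sum() an exact but expensive sum, and wb_stat_error() bounds the
approximation error. A sketch of the kind of check these enable (illustrative only,
not copied from the tree; the function name and threshold are assumptions):

/* Sketch: fall back to the exact, expensive sum only when the cheap
 * approximate counter is within its error margin of the threshold. */
static unsigned long wb_reclaimable_pages(struct bdi_writeback *wb,
					  unsigned long thresh)
{
	unsigned long nr = wb_stat(wb, WB_RECLAIMABLE);

	if (nr < thresh + wb_stat_error(wb))
		nr = wb_stat_sum(wb, WB_RECLAIMABLE);
	return nr;
}
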
@@@ -231,50 -141,57 +140,57 @@@ int bdi_set_max_ratio(struct backing_de
   * BDI_CAP_NO_WRITEBACK:   Don't write pages back
   * BDI_CAP_NO_ACCT_WB:     Don't automatically account writeback pages
   * BDI_CAP_STRICTLIMIT:    Keep number of dirty pages below bdi threshold.
+  *
+  * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback.
   */
  #define BDI_CAP_NO_ACCT_DIRTY 0x00000001
  #define BDI_CAP_NO_WRITEBACK  0x00000002
  #define BDI_CAP_NO_ACCT_WB    0x00000004
  #define BDI_CAP_STABLE_WRITES 0x00000008
  #define BDI_CAP_STRICTLIMIT   0x00000010
+ #define BDI_CAP_CGROUP_WRITEBACK 0x00000020
  
  #define BDI_CAP_NO_ACCT_AND_WRITEBACK \
        (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB)
  
  extern struct backing_dev_info noop_backing_dev_info;
  
- int writeback_in_progress(struct backing_dev_info *bdi);
- static inline int bdi_congested(struct backing_dev_info *bdi, int bdi_bits)
+ /**
+  * writeback_in_progress - determine whether there is writeback in progress
+  * @wb: bdi_writeback of interest
+  *
+  * Determine whether there is writeback waiting to be handled against a
+  * bdi_writeback.
+  */
+ static inline bool writeback_in_progress(struct bdi_writeback *wb)
  {
-       if (bdi->congested_fn)
-               return bdi->congested_fn(bdi->congested_data, bdi_bits);
-       return (bdi->state & bdi_bits);
+       return test_bit(WB_writeback_running, &wb->state);
  }
  
- static inline int bdi_read_congested(struct backing_dev_info *bdi)
+ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
  {
-       return bdi_congested(bdi, 1 << BDI_sync_congested);
- }
+       struct super_block *sb;
  
- static inline int bdi_write_congested(struct backing_dev_info *bdi)
- {
-       return bdi_congested(bdi, 1 << BDI_async_congested);
+       if (!inode)
+               return &noop_backing_dev_info;
+       sb = inode->i_sb;
+ #ifdef CONFIG_BLOCK
+       if (sb_is_blkdev_sb(sb))
+               return blk_get_backing_dev_info(I_BDEV(inode));
+ #endif
+       return sb->s_bdi;
  }
  
- static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits)
  {
-       return bdi_congested(bdi, (1 << BDI_sync_congested) |
-                                 (1 << BDI_async_congested));
- }
+       struct backing_dev_info *bdi = wb->bdi;
  
- enum {
-       BLK_RW_ASYNC    = 0,
-       BLK_RW_SYNC     = 1,
- };
+       if (bdi->congested_fn)
+               return bdi->congested_fn(bdi->congested_data, cong_bits);
+       return wb->congested->state & cong_bits;
+ }
  
- void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
- void set_bdi_congested(struct backing_dev_info *bdi, int sync);
  long congestion_wait(int sync, long timeout);
  long wait_iff_congested(struct zone *zone, int sync, long timeout);
  int pdflush_proc_obsolete(struct ctl_table *table, int write,
@@@ -318,4 -235,333 +234,333 @@@ static inline int bdi_sched_wait(void *
        return 0;
  }
  
- #endif                /* _LINUX_BACKING_DEV_H */
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp);
+ void wb_congested_put(struct bdi_writeback_congested *congested);
+ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp);
+ void wb_memcg_offline(struct mem_cgroup *memcg);
+ void wb_blkcg_offline(struct blkcg *blkcg);
+ int inode_congested(struct inode *inode, int cong_bits);
+ /**
+  * inode_cgwb_enabled - test whether cgroup writeback is enabled on an inode
+  * @inode: inode of interest
+  *
+  * cgroup writeback requires support from both the bdi and filesystem.
+  * Test whether @inode has both.
+  */
+ static inline bool inode_cgwb_enabled(struct inode *inode)
+ {
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       return bdi_cap_account_dirty(bdi) &&
+               (bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
+               (inode->i_sb->s_iflags & SB_I_CGROUPWB);
+ }
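
Both halves of the check above are opt-in: the bdi has to advertise
BDI_CAP_CGROUP_WRITEBACK and the filesystem has to set SB_I_CGROUPWB. A hypothetical
filesystem enabling cgroup writeback might do something along these lines (a sketch
only; the function name is illustrative):

/* Sketch: opt a filesystem and its bdi into cgroup writeback.
 * Assumes 'sb' already has a registered backing_dev_info. */
static void example_enable_cgwb(struct super_block *sb)
{
	sb->s_iflags |= SB_I_CGROUPWB;				/* fs side  */
	sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK;	/* bdi side */
}
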
+ /**
+  * wb_find_current - find wb for %current on a bdi
+  * @bdi: bdi of interest
+  *
+  * Find the wb of @bdi which matches both the memcg and blkcg of %current.
+  * Must be called under rcu_read_lock(), which protects the returned wb.
+  * Returns NULL if not found.
+  */
+ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+ {
+       struct cgroup_subsys_state *memcg_css;
+       struct bdi_writeback *wb;
+       memcg_css = task_css(current, memory_cgrp_id);
+       if (!memcg_css->parent)
+               return &bdi->wb;
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+       /*
+        * %current's blkcg equals the effective blkcg of its memcg.  No
+        * need to use the relatively expensive cgroup_get_e_css().
+        */
+       if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+               return wb;
+       return NULL;
+ }
+ /**
+  * wb_get_create_current - get or create wb for %current on a bdi
+  * @bdi: bdi of interest
+  * @gfp: allocation mask
+  *
+  * Equivalent to wb_get_create() on %current's memcg.  This function is
+  * called from a relatively hot path and optimizes the common cases using
+  * wb_find_current().
+  */
+ static inline struct bdi_writeback *
+ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+ {
+       struct bdi_writeback *wb;
+       rcu_read_lock();
+       wb = wb_find_current(bdi);
+       if (wb && unlikely(!wb_tryget(wb)))
+               wb = NULL;
+       rcu_read_unlock();
+       if (unlikely(!wb)) {
+               struct cgroup_subsys_state *memcg_css;
+               memcg_css = task_get_css(current, memory_cgrp_id);
+               wb = wb_get_create(bdi, memcg_css, gfp);
+               css_put(memcg_css);
+       }
+       return wb;
+ }
+ /**
+  * inode_to_wb_is_valid - test whether an inode has a wb associated
+  * @inode: inode of interest
+  *
+  * Returns %true if @inode has a wb associated.  May be called without any
+  * locking.
+  */
+ static inline bool inode_to_wb_is_valid(struct inode *inode)
+ {
+       return inode->i_wb;
+ }
+ /**
+  * inode_to_wb - determine the wb of an inode
+  * @inode: inode of interest
+  *
+  * Returns the wb @inode is currently associated with.  The caller must be
+  * holding either @inode->i_lock, @inode->i_mapping->tree_lock, or the
+  * associated wb's list_lock.
+  */
+ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+ {
+ #ifdef CONFIG_LOCKDEP
+       WARN_ON_ONCE(debug_locks &&
+                    (!lockdep_is_held(&inode->i_lock) &&
+                     !lockdep_is_held(&inode->i_mapping->tree_lock) &&
+                     !lockdep_is_held(&inode->i_wb->list_lock)));
+ #endif
+       return inode->i_wb;
+ }
+ /**
+  * unlocked_inode_to_wb_begin - begin unlocked inode wb access transaction
+  * @inode: target inode
+  * @lockedp: temp bool output param, to be passed to the end function
+  *
+  * The caller wants to access the wb associated with @inode but isn't
+  * holding inode->i_lock, mapping->tree_lock or wb->list_lock.  This
+  * function determines the wb associated with @inode and ensures that the
+  * association doesn't change until the transaction is finished with
+  * unlocked_inode_to_wb_end().
+  *
+  * The caller must call unlocked_inode_to_wb_end() with *@lockedp
+  * afterwards and can't sleep during the transaction.  IRQs may or may
+  * not be disabled on return.
+  */
+ static inline struct bdi_writeback *
+ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+ {
+       rcu_read_lock();
+       /*
+        * Paired with store_release in inode_switch_wb_work_fn() and
+        * ensures that we see the new wb if we see cleared I_WB_SWITCH.
+        */
+       *lockedp = smp_load_acquire(&inode->i_state) & I_WB_SWITCH;
+       if (unlikely(*lockedp))
+               spin_lock_irq(&inode->i_mapping->tree_lock);
+       /*
+        * Protected by either !I_WB_SWITCH + rcu_read_lock() or tree_lock.
+        * inode_to_wb() would bark about the missing locks; deref directly.
+        */
+       return inode->i_wb;
+ }
+ /**
+  * unlocked_inode_to_wb_end - end inode wb access transaction
+  * @inode: target inode
+  * @locked: *@lockedp from unlocked_inode_to_wb_begin()
+  */
+ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+ {
+       if (unlikely(locked))
+               spin_unlock_irq(&inode->i_mapping->tree_lock);
+       rcu_read_unlock();
+ }
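
The begin/end pair above acts as a small transaction around unlocked stat updates; a
minimal sketch of the calling pattern (the function name and the particular stat item
are only examples):

/* Sketch: bump a per-wb counter without taking i_lock, tree_lock or
 * list_lock just to look up the wb.  Must not sleep in between. */
static void example_account_reclaimable(struct inode *inode)
{
	struct bdi_writeback *wb;
	bool locked;

	wb = unlocked_inode_to_wb_begin(inode, &locked);
	inc_wb_stat(wb, WB_RECLAIMABLE);
	unlocked_inode_to_wb_end(inode, locked);
}
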
+ struct wb_iter {
+       int                     start_blkcg_id;
+       struct radix_tree_iter  tree_iter;
+       void                    **slot;
+ };
+ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi)
+ {
+       struct radix_tree_iter *titer = &iter->tree_iter;
+       WARN_ON_ONCE(!rcu_read_lock_held());
+       if (iter->start_blkcg_id >= 0) {
+               iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
+               iter->start_blkcg_id = -1;
+       } else {
+               iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
+       }
+       if (!iter->slot)
+               iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
+       if (iter->slot)
+               return *iter->slot;
+       return NULL;
+ }
+ static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
+                                                  struct backing_dev_info *bdi,
+                                                  int start_blkcg_id)
+ {
+       iter->start_blkcg_id = start_blkcg_id;
+       if (start_blkcg_id)
+               return __wb_iter_next(iter, bdi);
+       else
+               return &bdi->wb;
+ }
+ /**
+  * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+  * @wb_cur: cursor struct bdi_writeback pointer
+  * @bdi: bdi to walk wb's of
+  * @iter: pointer to struct wb_iter to be used as iteration buffer
+  * @start_blkcg_id: blkcg ID to start iteration from
+  *
+  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
+  * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+  * to be used as temp storage during iteration.  rcu_read_lock() must be
+  * held throughout iteration.
+  */
+ #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
+       for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);      \
+            (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
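
bdi_for_each_wb() is the walk primitive the flusher-side code builds on. A minimal
usage sketch follows (the function name is illustrative; nr_pages and reason are
supplied by the caller):

/* Sketch: kick writeback on every wb of a bdi that has dirty inodes.
 * rcu_read_lock() must be held across the whole walk. */
static void example_start_writeback_all(struct backing_dev_info *bdi,
					long nr_pages, enum wb_reason reason)
{
	struct bdi_writeback *wb;
	struct wb_iter iter;

	rcu_read_lock();
	bdi_for_each_wb(wb, bdi, &iter, 0)
		if (wb_has_dirty_io(wb))
			wb_start_writeback(wb, nr_pages, false, reason);
	rcu_read_unlock();
}
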
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static inline bool inode_cgwb_enabled(struct inode *inode)
+ {
+       return false;
+ }
+ static inline struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+ {
+       return bdi->wb.congested;
+ }
+ static inline void wb_congested_put(struct bdi_writeback_congested *congested)
+ {
+ }
+ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi)
+ {
+       return &bdi->wb;
+ }
+ static inline struct bdi_writeback *
+ wb_get_create_current(struct backing_dev_info *bdi, gfp_t gfp)
+ {
+       return &bdi->wb;
+ }
+ static inline bool inode_to_wb_is_valid(struct inode *inode)
+ {
+       return true;
+ }
+ static inline struct bdi_writeback *inode_to_wb(struct inode *inode)
+ {
+       return &inode_to_bdi(inode)->wb;
+ }
+ static inline struct bdi_writeback *
+ unlocked_inode_to_wb_begin(struct inode *inode, bool *lockedp)
+ {
+       return inode_to_wb(inode);
+ }
+ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
+ {
+ }
+ static inline void wb_memcg_offline(struct mem_cgroup *memcg)
+ {
+ }
+ static inline void wb_blkcg_offline(struct blkcg *blkcg)
+ {
+ }
+ struct wb_iter {
+       int             next_id;
+ };
+ #define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)            \
+       for ((iter)->next_id = (start_blkcg_id);                        \
+            ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
+ static inline int inode_congested(struct inode *inode, int cong_bits)
+ {
+       return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
+ static inline int inode_read_congested(struct inode *inode)
+ {
+       return inode_congested(inode, 1 << WB_sync_congested);
+ }
+ static inline int inode_write_congested(struct inode *inode)
+ {
+       return inode_congested(inode, 1 << WB_async_congested);
+ }
+ static inline int inode_rw_congested(struct inode *inode)
+ {
+       return inode_congested(inode, (1 << WB_sync_congested) |
+                                     (1 << WB_async_congested));
+ }
+ static inline int bdi_congested(struct backing_dev_info *bdi, int cong_bits)
+ {
+       return wb_congested(&bdi->wb, cong_bits);
+ }
+ static inline int bdi_read_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, 1 << WB_sync_congested);
+ }
+ static inline int bdi_write_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, 1 << WB_async_congested);
+ }
+ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
+ {
+       return bdi_congested(bdi, (1 << WB_sync_congested) |
+                                 (1 << WB_async_congested));
+ }
+ #endif        /* _LINUX_BACKING_DEV_H */
diff --combined include/linux/blk-cgroup.h
index 0000000000000000000000000000000000000000,07a32b813ed897d2610fc4abcc205dc6f5f0f0af..58cfab80dd707ff28d8b4e12fdf735bc24f7f60b
mode 000000,100644..100644
--- /dev/null
@@@ -1,0 -1,631 +1,655 @@@
 -/* CFQ specific, out here for blkcg->cfq_weight */
 -#define CFQ_WEIGHT_MIN                10
 -#define CFQ_WEIGHT_MAX                1000
 -#define CFQ_WEIGHT_DEFAULT    500
 -
+ #ifndef _BLK_CGROUP_H
+ #define _BLK_CGROUP_H
+ /*
+  * Common Block IO controller cgroup interface
+  *
+  * Based on ideas and code from CFQ, CFS and BFQ:
+  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+  *
+  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+  *                  Paolo Valente <paolo.valente@unimore.it>
+  *
+  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+  *                  Nauman Rafique <nauman@google.com>
+  */
+ #include <linux/cgroup.h>
+ #include <linux/u64_stats_sync.h>
+ #include <linux/seq_file.h>
+ #include <linux/radix-tree.h>
+ #include <linux/blkdev.h>
+ #include <linux/atomic.h>
+ /* Max limits for throttle policy */
+ #define THROTL_IOPS_MAX               UINT_MAX
 -      /* TODO: per-policy storage in blkcg */
 -      unsigned int                    cfq_weight;     /* belongs to cfq */
 -      unsigned int                    cfq_leaf_weight;
+ #ifdef CONFIG_BLK_CGROUP
+ enum blkg_rwstat_type {
+       BLKG_RWSTAT_READ,
+       BLKG_RWSTAT_WRITE,
+       BLKG_RWSTAT_SYNC,
+       BLKG_RWSTAT_ASYNC,
+       BLKG_RWSTAT_NR,
+       BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+ };
+ struct blkcg_gq;
+ struct blkcg {
+       struct cgroup_subsys_state      css;
+       spinlock_t                      lock;
+       struct radix_tree_root          blkg_tree;
+       struct blkcg_gq                 *blkg_hint;
+       struct hlist_head               blkg_list;
++      struct blkcg_policy_data        *pd[BLKCG_MAX_POLS];
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head                cgwb_list;
+ #endif
+ };
+ struct blkg_stat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt;
+ };
+ struct blkg_rwstat {
+       struct u64_stats_sync           syncp;
+       uint64_t                        cnt[BLKG_RWSTAT_NR];
+ };
+ /*
+  * A blkcg_gq (blkg) is an association between a block cgroup (blkcg) and a
+  * request_queue (q).  This is used by blkcg policies which need to track
+  * information per blkcg - q pair.
+  *
+  * There can be multiple active blkcg policies and each has its private
+  * data on each blkg, the size of which is determined by
+  * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
+  * together with blkg and invokes pd_init/exit_fn() methods.
+  *
+  * Such private data must embed struct blkg_policy_data (pd) at the
+  * beginning and pd_size can't be smaller than pd.
+  */
+ struct blkg_policy_data {
+       /* the blkg and policy id this per-policy data belongs to */
+       struct blkcg_gq                 *blkg;
+       int                             plid;
+       /* used during policy activation */
+       struct list_head                alloc_node;
+ };
++/*
++ * Policies that need to keep per-blkcg data that is independent of any
++ * request_queue associated with it must specify the size of that data
++ * with the cpd_size field of the blkcg_policy structure and embed a
++ * blkcg_policy_data in it.  blkcg core allocates the policy-specific
++ * per-blkcg structures lazily the first time they are actually needed,
++ * so it handles them together with blkgs.  cpd_init() is invoked to let
++ * each policy handle its per-blkcg data.
++ */
++struct blkcg_policy_data {
++      /* the policy id this per-policy data belongs to */
++      int                             plid;
++
++      /* used during policy activation */
++      struct list_head                alloc_node;
++};
++
+ /* association between a blk cgroup and a request queue */
+ struct blkcg_gq {
+       /* Pointer to the associated request_queue */
+       struct request_queue            *q;
+       struct list_head                q_node;
+       struct hlist_node               blkcg_node;
+       struct blkcg                    *blkcg;
+       /*
+        * Each blkg gets congested separately and the congestion state is
+        * propagated to the matching bdi_writeback_congested.
+        */
+       struct bdi_writeback_congested  *wb_congested;
+       /* all non-root blkcg_gq's are guaranteed to have access to parent */
+       struct blkcg_gq                 *parent;
+       /* request allocation list for this blkcg-q pair */
+       struct request_list             rl;
+       /* reference count */
+       atomic_t                        refcnt;
+       /* is this blkg online? protected by both blkcg and q locks */
+       bool                            online;
+       struct blkg_policy_data         *pd[BLKCG_MAX_POLS];
+       struct rcu_head                 rcu_head;
+ };
++typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
+ typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
+ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+ struct blkcg_policy {
+       int                             plid;
+       /* policy specific private data size */
+       size_t                          pd_size;
++      /* policy specific per-blkcg data size */
++      size_t                          cpd_size;
+       /* cgroup files for the policy */
+       struct cftype                   *cftypes;
+       /* operations */
++      blkcg_pol_init_cpd_fn           *cpd_init_fn;
+       blkcg_pol_init_pd_fn            *pd_init_fn;
+       blkcg_pol_online_pd_fn          *pd_online_fn;
+       blkcg_pol_offline_pd_fn         *pd_offline_fn;
+       blkcg_pol_exit_pd_fn            *pd_exit_fn;
+       blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
+ };
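As a rough illustration (not part of this patch), a policy with both per-blkg and per-blkcg private data might describe itself to blkcg core roughly as follows; all myp_* identifiers are hypothetical:

#include <linux/module.h>
#include <linux/blk-cgroup.h>

/* hypothetical per-blkcg data; the embedded blkcg_policy_data must come first */
struct myp_cgrp_data {
	struct blkcg_policy_data	cpd;
	unsigned int			weight;
};

/* hypothetical per-(blkcg, request_queue) data; blkg_policy_data must come first */
struct myp_grp_data {
	struct blkg_policy_data		pd;
	u64				bytes_issued;
};

static void myp_cpd_init(const struct blkcg *blkcg)
{
	/* blkcg core has already allocated cpd_size bytes for this blkcg */
}

static void myp_pd_init(struct blkcg_gq *blkg)
{
	/* blkcg core has already allocated pd_size bytes for this blkg */
}

static struct blkcg_policy myp_policy = {
	.pd_size	= sizeof(struct myp_grp_data),
	.cpd_size	= sizeof(struct myp_cgrp_data),
	.cpd_init_fn	= myp_cpd_init,
	.pd_init_fn	= myp_pd_init,
};

static int __init myp_init(void)
{
	/* ->plid is assigned by blkcg core on successful registration */
	return blkcg_policy_register(&myp_policy);
}
module_init(myp_init);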
+ extern struct blkcg blkcg_root;
+ extern struct cgroup_subsys_state * const blkcg_root_css;
+ struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+                                   struct request_queue *q);
+ int blkcg_init_queue(struct request_queue *q);
+ void blkcg_drain_queue(struct request_queue *q);
+ void blkcg_exit_queue(struct request_queue *q);
+ /* Blkio controller policy registration */
+ int blkcg_policy_register(struct blkcg_policy *pol);
+ void blkcg_policy_unregister(struct blkcg_policy *pol);
+ int blkcg_activate_policy(struct request_queue *q,
+                         const struct blkcg_policy *pol);
+ void blkcg_deactivate_policy(struct request_queue *q,
+                            const struct blkcg_policy *pol);
+ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+                      u64 (*prfill)(struct seq_file *,
+                                    struct blkg_policy_data *, int),
+                      const struct blkcg_policy *pol, int data,
+                      bool show_total);
+ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                        const struct blkg_rwstat *rwstat);
+ u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
+ u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+                      int off);
+ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
+ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
+                                            int off);
+ struct blkg_conf_ctx {
+       struct gendisk                  *disk;
+       struct blkcg_gq                 *blkg;
+       u64                             v;
+ };
+ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+                  const char *input, struct blkg_conf_ctx *ctx);
+ void blkg_conf_finish(struct blkg_conf_ctx *ctx);
+ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+ {
+       return css ? container_of(css, struct blkcg, css) : NULL;
+ }
+ static inline struct blkcg *task_blkcg(struct task_struct *tsk)
+ {
+       return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+ }
+ static inline struct blkcg *bio_blkcg(struct bio *bio)
+ {
+       if (bio && bio->bi_css)
+               return css_to_blkcg(bio->bi_css);
+       return task_blkcg(current);
+ }
+ static inline struct cgroup_subsys_state *
+ task_get_blkcg_css(struct task_struct *task)
+ {
+       return task_get_css(task, blkio_cgrp_id);
+ }
+ /**
+  * blkcg_parent - get the parent of a blkcg
+  * @blkcg: blkcg of interest
+  *
+  * Return the parent blkcg of @blkcg.  Can be called anytime.
+  */
+ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+ {
+       return css_to_blkcg(blkcg->css.parent);
+ }
+ /**
+  * blkg_to_pd - get policy private data
+  * @blkg: blkg of interest
+  * @pol: policy of interest
+  *
+  * Return pointer to private data associated with the @blkg-@pol pair.
+  */
+ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol)
+ {
+       return blkg ? blkg->pd[pol->plid] : NULL;
+ }
++static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
++                                                   struct blkcg_policy *pol)
++{
++      return blkcg ? blkcg->pd[pol->plid] : NULL;
++}
++
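In practice a policy wraps these two accessors in container_of() helpers that recover its embedding structures; a minimal sketch, reusing the hypothetical myp_* types and myp_policy from the sketch further up:

static inline struct myp_grp_data *blkg_to_myp(struct blkcg_gq *blkg)
{
	struct blkg_policy_data *pd = blkg_to_pd(blkg, &myp_policy);

	return pd ? container_of(pd, struct myp_grp_data, pd) : NULL;
}

static inline struct myp_cgrp_data *blkcg_to_myp(struct blkcg *blkcg)
{
	struct blkcg_policy_data *cpd = blkcg_to_cpd(blkcg, &myp_policy);

	return cpd ? container_of(cpd, struct myp_cgrp_data, cpd) : NULL;
}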
+ /**
+  * pd_to_blkg - get blkg associated with policy private data
+  * @pd: policy private data of interest
+  *
+  * @pd is policy private data.  Determine the blkg it's associated with.
+  */
+ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+ {
+       return pd ? pd->blkg : NULL;
+ }
+ /**
+  * blkg_path - format cgroup path of blkg
+  * @blkg: blkg of interest
+  * @buf: target buffer
+  * @buflen: target buffer length
+  *
+  * Format the path of the cgroup of @blkg into @buf.
+  */
+ static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+ {
+       char *p;
+       p = cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+       if (!p) {
+               strncpy(buf, "<unavailable>", buflen);
+               return -ENAMETOOLONG;
+       }
+       memmove(buf, p, buf + buflen - p);
+       return 0;
+ }
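For example, a policy that wants to log which cgroup a blkg belongs to could do something along these lines (the function name and buffer size are illustrative only):

static void myp_report(struct blkcg_gq *blkg)
{
	char path[128];

	if (!blkg_path(blkg, path, sizeof(path)))
		pr_info("myp: blkg belongs to cgroup %s\n", path);
}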
+ /**
+  * blkg_get - get a blkg reference
+  * @blkg: blkg to get
+  *
+  * The caller should be holding an existing reference.
+  */
+ static inline void blkg_get(struct blkcg_gq *blkg)
+ {
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       atomic_inc(&blkg->refcnt);
+ }
+ void __blkg_release_rcu(struct rcu_head *rcu);
+ /**
+  * blkg_put - put a blkg reference
+  * @blkg: blkg to put
+  */
+ static inline void blkg_put(struct blkcg_gq *blkg)
+ {
+       WARN_ON_ONCE(atomic_read(&blkg->refcnt) <= 0);
+       if (atomic_dec_and_test(&blkg->refcnt))
+               call_rcu(&blkg->rcu_head, __blkg_release_rcu);
+ }
+ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
+                              bool update_hint);
+ /**
+  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+  * @d_blkg: loop cursor pointing to the current descendant
+  * @pos_css: used for iteration
+  * @p_blkg: target blkg to walk descendants of
+  *
+  * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
+  * read locked.  If called under either blkcg or queue lock, the iteration
+  * is guaranteed to include all and only online blkgs.  The caller may
+  * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+  * @p_blkg is included in the iteration and the first node to be visited.
+  */
+ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)         \
+       css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
+ /**
+  * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+  * @d_blkg: loop cursor pointing to the current descendant
+  * @pos_css: used for iteration
+  * @p_blkg: target blkg to walk descendants of
+  *
+  * Similar to blkg_for_each_descendant_pre() but performs post-order
+  * traversal instead.  Synchronization rules are the same.  @p_blkg is
+  * included in the iteration and the last node to be visited.
+  */
+ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)                \
+       css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
+               if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
+                                             (p_blkg)->q, false)))
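A hedged usage sketch, resetting the hypothetical myp counters across a subtree with the pre-order walk (under RCU only; hold the blkcg or queue lock as well if the walk must see exactly the online blkgs):

static void myp_reset_subtree(struct blkcg_gq *parent_blkg)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css, parent_blkg) {
		struct myp_grp_data *mgd = blkg_to_myp(blkg);

		if (mgd)
			mgd->bytes_issued = 0;
	}
	rcu_read_unlock();
}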
+ /**
+  * blk_get_rl - get request_list to use
+  * @q: request_queue of interest
+  * @bio: bio which will be attached to the allocated request (may be %NULL)
+  *
+  * The caller wants to allocate a request from @q to use for @bio.  Find
+  * the request_list to use and obtain a reference on it.  Should be called
+  * under queue_lock.  This function is guaranteed to return a non-%NULL
+  * request_list.
+  */
+ static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio)
+ {
+       struct blkcg *blkcg;
+       struct blkcg_gq *blkg;
+       rcu_read_lock();
+       blkcg = bio_blkcg(bio);
+       /* bypass blkg lookup and use @q->root_rl directly for root */
+       if (blkcg == &blkcg_root)
+               goto root_rl;
+       /*
+        * Try to use blkg->rl.  blkg lookup may fail under memory pressure
+        * or if either the blkcg or queue is going away.  Fall back to
+        * root_rl in such cases.
+        */
+       blkg = blkg_lookup_create(blkcg, q);
+       if (unlikely(IS_ERR(blkg)))
+               goto root_rl;
+       blkg_get(blkg);
+       rcu_read_unlock();
+       return &blkg->rl;
+ root_rl:
+       rcu_read_unlock();
+       return &q->root_rl;
+ }
+ /**
+  * blk_put_rl - put request_list
+  * @rl: request_list to put
+  *
+  * Put the reference acquired by blk_get_rl().  Should be called under
+  * queue_lock.
+  */
+ static inline void blk_put_rl(struct request_list *rl)
+ {
+       /* root_rl may not have blkg set */
+       if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+               blkg_put(rl->blkg);
+ }
+ /**
+  * blk_rq_set_rl - associate a request with a request_list
+  * @rq: request of interest
+  * @rl: target request_list
+  *
+  * Associate @rq with @rl so that accounting and freeing can know the
+  * request_list @rq came from.
+  */
+ static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl)
+ {
+       rq->rl = rl;
+ }
+ /**
+  * blk_rq_rl - return the request_list a request came from
+  * @rq: request of interest
+  *
+  * Return the request_list @rq is allocated from.
+  */
+ static inline struct request_list *blk_rq_rl(struct request *rq)
+ {
+       return rq->rl;
+ }
+ struct request_list *__blk_queue_next_rl(struct request_list *rl,
+                                        struct request_queue *q);
+ /**
+  * blk_queue_for_each_rl - iterate through all request_lists of a request_queue
+  *
+  * Should be used under queue_lock.
+  */
+ #define blk_queue_for_each_rl(rl, q)  \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
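Taken together, the request_list helpers pair up as in the following sketch of an allocation and completion path (loosely modelled on blk-core usage, not literal code from this patch; both hypothetical helpers assume q->queue_lock is held):

static void myp_attach_rl(struct request_queue *q, struct bio *bio,
			  struct request *rq)
{
	struct request_list *rl = blk_get_rl(q, bio);	/* guaranteed non-NULL */

	blk_rq_set_rl(rq, rl);		/* remember which rl @rq came from */
}

static void myp_detach_rl(struct request *rq)
{
	/* drop the reference taken by blk_get_rl() when the request is freed */
	blk_put_rl(blk_rq_rl(rq));
}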
+ static inline void blkg_stat_init(struct blkg_stat *stat)
+ {
+       u64_stats_init(&stat->syncp);
+ }
+ /**
+  * blkg_stat_add - add a value to a blkg_stat
+  * @stat: target blkg_stat
+  * @val: value to add
+  *
+  * Add @val to @stat.  The caller is responsible for synchronizing calls to
+  * this function.
+  */
+ static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
+ {
+       u64_stats_update_begin(&stat->syncp);
+       stat->cnt += val;
+       u64_stats_update_end(&stat->syncp);
+ }
+ /**
+  * blkg_stat_read - read the current value of a blkg_stat
+  * @stat: blkg_stat to read
+  *
+  * Read the current value of @stat.  This function can be called without
+  * synchronization and takes care of u64 atomicity.
+  */
+ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
+ {
+       unsigned int start;
+       uint64_t v;
+       do {
+               start = u64_stats_fetch_begin_irq(&stat->syncp);
+               v = stat->cnt;
+       } while (u64_stats_fetch_retry_irq(&stat->syncp, start));
+       return v;
+ }
+ /**
+  * blkg_stat_reset - reset a blkg_stat
+  * @stat: blkg_stat to reset
+  */
+ static inline void blkg_stat_reset(struct blkg_stat *stat)
+ {
+       stat->cnt = 0;
+ }
+ /**
+  * blkg_stat_merge - merge a blkg_stat into another
+  * @to: the destination blkg_stat
+  * @from: the source
+  *
+  * Add @from's count to @to.
+  */
+ static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+ {
+       blkg_stat_add(to, blkg_stat_read(from));
+ }
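A short sketch of the intended usage pattern for a plain blkg_stat counter (the surrounding function is hypothetical):

static void myp_stat_demo(struct blkg_stat *stat)
{
	blkg_stat_init(stat);		/* once, e.g. from the policy's pd_init_fn() */

	blkg_stat_add(stat, 1);		/* updates must be serialized by the caller */

	pr_info("count=%llu\n",		/* reads need no serialization */
		(unsigned long long)blkg_stat_read(stat));
}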
+ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+ {
+       u64_stats_init(&rwstat->syncp);
+ }
+ /**
+  * blkg_rwstat_add - add a value to a blkg_rwstat
+  * @rwstat: target blkg_rwstat
+  * @rw: mask of REQ_{WRITE|SYNC}
+  * @val: value to add
+  *
+  * Add @val to @rwstat.  The counters are chosen according to @rw.  The
+  * caller is responsible for synchronizing calls to this function.
+  */
+ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+                                  int rw, uint64_t val)
+ {
+       u64_stats_update_begin(&rwstat->syncp);
+       if (rw & REQ_WRITE)
+               rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_READ] += val;
+       if (rw & REQ_SYNC)
+               rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+       else
+               rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+       u64_stats_update_end(&rwstat->syncp);
+ }
+ /**
+  * blkg_rwstat_read - read the current values of a blkg_rwstat
+  * @rwstat: blkg_rwstat to read
+  *
+  * Read the current snapshot of @rwstat and return it as the return value.
+  * This function can be called without synchronization and takes care of
+  * u64 atomicity.
+  */
+ static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
+ {
+       unsigned int start;
+       struct blkg_rwstat tmp;
+       do {
+               start = u64_stats_fetch_begin_irq(&rwstat->syncp);
+               tmp = *rwstat;
+       } while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+       return tmp;
+ }
+ /**
+  * blkg_rwstat_total - read the total count of a blkg_rwstat
+  * @rwstat: blkg_rwstat to read
+  *
+  * Return the total count of @rwstat regardless of the IO direction.  This
+  * function can be called without synchronization and takes care of u64
+  * atomicity.
+  */
+ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+ {
+       struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
+       return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+ }
+ /**
+  * blkg_rwstat_reset - reset a blkg_rwstat
+  * @rwstat: blkg_rwstat to reset
+  */
+ static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+ {
+       memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+ }
+ /**
+  * blkg_rwstat_merge - merge a blkg_rwstat into another
+  * @to: the destination blkg_rwstat
+  * @from: the source
+  *
+  * Add @from's counts to @to.
+  */
+ static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
+                                    struct blkg_rwstat *from)
+ {
+       struct blkg_rwstat v = blkg_rwstat_read(from);
+       int i;
+       u64_stats_update_begin(&to->syncp);
+       for (i = 0; i < BLKG_RWSTAT_NR; i++)
+               to->cnt[i] += v.cnt[i];
+       u64_stats_update_end(&to->syncp);
+ }
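For illustration, a hedged sketch of accounting a bio into a blkg_rwstat and reading it back; bio->bi_rw carries the REQ_WRITE/REQ_SYNC bits that blkg_rwstat_add() switches on, and the myp_* names are hypothetical:

static void myp_account_bio(struct blkg_rwstat *rwstat, struct bio *bio)
{
	/* updates are serialized by the caller, typically under the queue lock */
	blkg_rwstat_add(rwstat, bio->bi_rw, bio->bi_iter.bi_size);
}

static void myp_dump_rwstat(struct blkg_rwstat *rwstat)
{
	struct blkg_rwstat snap = blkg_rwstat_read(rwstat);	/* lockless snapshot */

	/* READ + WRITE covers every byte once; SYNC/ASYNC is a parallel split */
	pr_info("rd=%llu wr=%llu total=%llu\n",
		(unsigned long long)snap.cnt[BLKG_RWSTAT_READ],
		(unsigned long long)snap.cnt[BLKG_RWSTAT_WRITE],
		(unsigned long long)blkg_rwstat_total(rwstat));
}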
+ #else /* CONFIG_BLK_CGROUP */
+ struct blkcg {
+ };
+ struct blkg_policy_data {
+ };
++struct blkcg_policy_data {
++};
++
+ struct blkcg_gq {
+ };
+ struct blkcg_policy {
+ };
+ #define blkcg_root_css        ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
+ static inline struct cgroup_subsys_state *
+ task_get_blkcg_css(struct task_struct *task)
+ {
+       return NULL;
+ }
+ #ifdef CONFIG_BLOCK
+ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+ static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
+ static inline void blkcg_drain_queue(struct request_queue *q) { }
+ static inline void blkcg_exit_queue(struct request_queue *q) { }
+ static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+ static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+ static inline int blkcg_activate_policy(struct request_queue *q,
+                                       const struct blkcg_policy *pol) { return 0; }
+ static inline void blkcg_deactivate_policy(struct request_queue *q,
+                                          const struct blkcg_policy *pol) { }
+ static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
+ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+                                                 struct blkcg_policy *pol) { return NULL; }
+ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+ static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+ static inline void blkg_get(struct blkcg_gq *blkg) { }
+ static inline void blkg_put(struct blkcg_gq *blkg) { }
+ static inline struct request_list *blk_get_rl(struct request_queue *q,
+                                             struct bio *bio) { return &q->root_rl; }
+ static inline void blk_put_rl(struct request_list *rl) { }
+ static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
+ static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
+ #define blk_queue_for_each_rl(rl, q)  \
+       for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+ #endif        /* CONFIG_BLOCK */
+ #endif        /* CONFIG_BLK_CGROUP */
+ #endif        /* _BLK_CGROUP_H */
diff --combined include/linux/blkdev.h
index 5ced29cef03f7b01819019e7c34cbbc1b2b549a5,ab4a27852f1bdaf836825def801e727de3e8fe50..7f2f54b4587f78e17536c9197fe8172b566f8802
@@@ -12,7 -12,7 +12,7 @@@
  #include <linux/timer.h>
  #include <linux/workqueue.h>
  #include <linux/pagemap.h>
- #include <linux/backing-dev.h>
+ #include <linux/backing-dev-defs.h>
  #include <linux/wait.h>
  #include <linux/mempool.h>
  #include <linux/bio.h>
@@@ -22,7 -22,8 +22,7 @@@
  #include <linux/smp.h>
  #include <linux/rcupdate.h>
  #include <linux/percpu-refcount.h>
 -
 -#include <asm/scatterlist.h>
 +#include <linux/scatterlist.h>
  
  struct module;
  struct scsi_ioctl_command;
@@@ -787,25 -788,8 +787,6 @@@ extern int scsi_cmd_ioctl(struct reques
  extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
                         struct scsi_ioctl_command __user *);
  
- /*
-  * A queue has just exitted congestion.  Note this in the global counter of
-  * congested queues, and wake up anyone who was waiting for requests to be
-  * put back.
-  */
- static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
- {
-       clear_bdi_congested(&q->backing_dev_info, sync);
- }
- /*
-  * A queue has just entered congestion.  Flag that in the queue's VM-visible
-  * state flags and increment the global gounter of congested queues.
-  */
- static inline void blk_set_queue_congested(struct request_queue *q, int sync)
- {
-       set_bdi_congested(&q->backing_dev_info, sync);
- }
 -extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
--
  extern void blk_start_queue(struct request_queue *q);
  extern void blk_stop_queue(struct request_queue *q);
  extern void blk_sync_queue(struct request_queue *q);
@@@ -1021,7 -1005,6 +1002,7 @@@ bool __must_check blk_get_queue(struct 
  struct request_queue *blk_alloc_queue(gfp_t);
  struct request_queue *blk_alloc_queue_node(gfp_t, int);
  extern void blk_put_queue(struct request_queue *);
 +extern void blk_set_queue_dying(struct request_queue *);
  
  /*
   * block layer runtime pm functions
diff --combined include/linux/fs.h
index 5db7b1379d174848116124b5f9c26fc212798d21,2c5e33a5b2af4af5934597b082452858d7717f28..e351da4a934f415b4ba0d2cc84acaa52c8120614
  #include <uapi/linux/fs.h>
  
  struct backing_dev_info;
+ struct bdi_writeback;
  struct export_operations;
  struct hd_geometry;
  struct iovec;
 -struct nameidata;
  struct kiocb;
  struct kobject;
  struct pipe_inode_info;
@@@ -634,6 -636,14 +635,14 @@@ struct inode 
  
        struct hlist_node       i_hash;
        struct list_head        i_wb_list;      /* backing dev IO list */
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct bdi_writeback    *i_wb;          /* the associated cgroup wb */
+       /* foreign inode detection, see wbc_detach_inode() */
+       int                     i_wb_frn_winner;
+       u16                     i_wb_frn_avg_time;
+       u16                     i_wb_frn_history;
+ #endif
        struct list_head        i_lru;          /* inode LRU list */
        struct list_head        i_sb_list;
        union {
                struct pipe_inode_info  *i_pipe;
                struct block_device     *i_bdev;
                struct cdev             *i_cdev;
 +              char                    *i_link;
        };
  
        __u32                   i_generation;
@@@ -1232,6 -1241,8 +1241,8 @@@ struct mm_struct
  #define UMOUNT_NOFOLLOW       0x00000008      /* Don't follow symlink on umount */
  #define UMOUNT_UNUSED 0x80000000      /* Flag guaranteed to be unused */
  
+ /* sb->s_iflags */
+ #define SB_I_CGROUPWB 0x00000001      /* cgroup-aware writeback enabled */
  
  /* Possible states of 'frozen' field */
  enum {
@@@ -1270,6 -1281,7 +1281,7 @@@ struct super_block 
        const struct quotactl_ops       *s_qcop;
        const struct export_operations *s_export_op;
        unsigned long           s_flags;
+       unsigned long           s_iflags;       /* internal SB_I_* flags */
        unsigned long           s_magic;
        struct dentry           *s_root;
        struct rw_semaphore     s_umount;
@@@ -1607,12 -1619,12 +1619,12 @@@ struct file_operations 
  
  struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
 -      void * (*follow_link) (struct dentry *, struct nameidata *);
 +      const char * (*follow_link) (struct dentry *, void **);
        int (*permission) (struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);
  
        int (*readlink) (struct dentry *, char __user *,int);
 -      void (*put_link) (struct dentry *, struct nameidata *, void *);
 +      void (*put_link) (struct inode *, void *);
  
        int (*create) (struct inode *,struct dentry *, umode_t, bool);
        int (*link) (struct dentry *,struct inode *,struct dentry *);
@@@ -1806,6 -1818,11 +1818,11 @@@ struct super_operations 
   *
   * I_DIO_WAKEUP               Never set.  Only used as a key for wait_on_bit().
   *
+  * I_WB_SWITCH                Cgroup bdi_writeback switching in progress.  Used to
+  *                    synchronize competing switching instances and to tell
+  *                    wb stat updates to grab mapping->tree_lock.  See
+  *                    inode_switch_wb_work_fn() for details.
+  *
   * Q: What is the difference between I_WILL_FREE and I_FREEING?
   */
  #define I_DIRTY_SYNC          (1 << 0)
  #define I_DIRTY_TIME          (1 << 11)
  #define __I_DIRTY_TIME_EXPIRED        12
  #define I_DIRTY_TIME_EXPIRED  (1 << __I_DIRTY_TIME_EXPIRED)
+ #define I_WB_SWITCH           (1 << 13)
  
  #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
  #define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
@@@ -1879,7 -1897,6 +1897,7 @@@ enum file_time_flags 
        S_VERSION = 8,
  };
  
 +extern bool atime_needs_update(const struct path *, struct inode *);
  extern void touch_atime(const struct path *);
  static inline void file_accessed(struct file *file)
  {
@@@ -2241,7 -2258,13 +2259,13 @@@ extern struct super_block *freeze_bdev(
  extern void emergency_thaw_all(void);
  extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
  extern int fsync_bdev(struct block_device *);
- extern int sb_is_blkdev_sb(struct super_block *sb);
+ extern struct super_block *blockdev_superblock;
+ static inline bool sb_is_blkdev_sb(struct super_block *sb)
+ {
+       return sb == blockdev_superblock;
+ }
  #else
  static inline void bd_forget(struct inode *inode) {}
  static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@@ -2708,14 -2731,13 +2732,14 @@@ extern const struct file_operations gen
  
  extern int readlink_copy(char __user *, int, const char *);
  extern int page_readlink(struct dentry *, char __user *, int);
 -extern void *page_follow_link_light(struct dentry *, struct nameidata *);
 -extern void page_put_link(struct dentry *, struct nameidata *, void *);
 +extern const char *page_follow_link_light(struct dentry *, void **);
 +extern void page_put_link(struct inode *, void *);
  extern int __page_symlink(struct inode *inode, const char *symname, int len,
                int nofs);
  extern int page_symlink(struct inode *inode, const char *symname, int len);
  extern const struct inode_operations page_symlink_inode_operations;
 -extern void kfree_put_link(struct dentry *, struct nameidata *, void *);
 +extern void kfree_put_link(struct inode *, void *);
 +extern void free_page_put_link(struct inode *, void *);
  extern int generic_readlink(struct dentry *, char __user *, int);
  extern void generic_fillattr(struct inode *, struct kstat *);
  int vfs_getattr_nosec(struct path *path, struct kstat *stat);
@@@ -2726,8 -2748,6 +2750,8 @@@ void __inode_sub_bytes(struct inode *in
  void inode_sub_bytes(struct inode *inode, loff_t bytes);
  loff_t inode_get_bytes(struct inode *inode);
  void inode_set_bytes(struct inode *inode, loff_t bytes);
 +const char *simple_follow_link(struct dentry *, void **);
 +extern const struct inode_operations simple_symlink_inode_operations;
  
  extern int iterate_dir(struct file *, struct dir_context *);
  
diff --combined include/linux/memcontrol.h
index 6c8918114804fda89d00ed3e6b1482539f2dd4ee,c3eb19e2bc1c43da6d4738fb2bf2daf78e271517..73b02b0a8f609ac757de6ee59b23bcf8b0e87396
@@@ -41,6 -41,7 +41,7 @@@ enum mem_cgroup_stat_index 
        MEM_CGROUP_STAT_RSS,            /* # of pages charged as anon rss */
        MEM_CGROUP_STAT_RSS_HUGE,       /* # of pages charged as anon huge */
        MEM_CGROUP_STAT_FILE_MAPPED,    /* # of pages charged as file rss */
+       MEM_CGROUP_STAT_DIRTY,          /* # of dirty pages in page cache */
        MEM_CGROUP_STAT_WRITEBACK,      /* # of pages under writeback */
        MEM_CGROUP_STAT_SWAP,           /* # of pages, swapped out */
        MEM_CGROUP_STAT_NSTATS,
@@@ -67,6 -68,8 +68,8 @@@ enum mem_cgroup_events_index 
  };
  
  #ifdef CONFIG_MEMCG
+ extern struct cgroup_subsys_state *mem_cgroup_root_css;
  void mem_cgroup_events(struct mem_cgroup *memcg,
                       enum mem_cgroup_events_index idx,
                       unsigned int nr);
@@@ -112,6 -115,7 +115,7 @@@ static inline bool mm_match_cgroup(stru
  }
  
  extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
+ extern struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page);
  
  struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
                                   struct mem_cgroup *,
@@@ -195,6 -199,8 +199,8 @@@ void mem_cgroup_split_huge_fixup(struc
  #else /* CONFIG_MEMCG */
  struct mem_cgroup;
  
+ #define mem_cgroup_root_css ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))
  static inline void mem_cgroup_events(struct mem_cgroup *memcg,
                                     enum mem_cgroup_events_index idx,
                                     unsigned int nr)
@@@ -382,6 -388,29 +388,29 @@@ enum 
        OVER_LIMIT,
  };
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg);
+ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb);
+ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback);
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+ {
+       return NULL;
+ }
+ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
+                                      unsigned long *pavail,
+                                      unsigned long *pdirty,
+                                      unsigned long *pwriteback)
+ {
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  struct sock;
  #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
  void sock_update_memcg(struct sock *sk);
@@@ -463,8 -492,6 +492,8 @@@ memcg_kmem_newpage_charge(gfp_t gfp, st
        if (!memcg_kmem_enabled())
                return true;
  
 +      if (gfp & __GFP_NOACCOUNT)
 +              return true;
        /*
         * __GFP_NOFAIL allocations will move on even if charging is not
         * possible. Therefore we don't even try, and have this allocation
@@@ -524,8 -551,6 +553,8 @@@ memcg_kmem_get_cache(struct kmem_cache 
  {
        if (!memcg_kmem_enabled())
                return cachep;
 +      if (gfp & __GFP_NOACCOUNT)
 +              return cachep;
        if (gfp & __GFP_NOFAIL)
                return cachep;
        if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
diff --combined include/linux/mm.h
index 24ad583596d1219b4ec1111e5aea3045230ee650,4024543b4203eb86d29ef8b162289efd1a775f7a..99959a34f4f15e6d66b8a6681b256134634164ee
@@@ -27,6 -27,7 +27,7 @@@ struct anon_vma_chain
  struct file_ra_state;
  struct user_struct;
  struct writeback_control;
+ struct bdi_writeback;
  
  #ifndef CONFIG_NEED_MULTIPLE_NODES    /* Don't use mapnrs, do it properly */
  extern unsigned long max_mapnr;
@@@ -499,7 -500,7 +500,7 @@@ static inline int page_count(struct pag
  
  static inline bool __compound_tail_refcounted(struct page *page)
  {
 -      return !PageSlab(page) && !PageHeadHuge(page);
 +      return PageAnon(page) && !PageSlab(page) && !PageHeadHuge(page);
  }
  
  /*
@@@ -1211,10 -1212,13 +1212,13 @@@ int __set_page_dirty_nobuffers(struct p
  int __set_page_dirty_no_writeback(struct page *page);
  int redirty_page_for_writepage(struct writeback_control *wbc,
                                struct page *page);
- void account_page_dirtied(struct page *page, struct address_space *mapping);
- void account_page_cleaned(struct page *page, struct address_space *mapping);
+ void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg);
+ void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb);
  int set_page_dirty(struct page *page);
  int set_page_dirty_lock(struct page *page);
+ void cancel_dirty_page(struct page *page);
  int clear_page_dirty_for_io(struct page *page);
  
  int get_cmdline(struct task_struct *task, char *buffer, int buflen);
@@@ -2146,47 -2150,12 +2150,47 @@@ enum mf_flags 
  extern int memory_failure(unsigned long pfn, int trapno, int flags);
  extern void memory_failure_queue(unsigned long pfn, int trapno, int flags);
  extern int unpoison_memory(unsigned long pfn);
 +extern int get_hwpoison_page(struct page *page);
  extern int sysctl_memory_failure_early_kill;
  extern int sysctl_memory_failure_recovery;
  extern void shake_page(struct page *p, int access);
  extern atomic_long_t num_poisoned_pages;
  extern int soft_offline_page(struct page *page, int flags);
  
 +
 +/*
 + * Error handlers for various types of pages.
 + */
 +enum mf_result {
 +      MF_IGNORED,     /* Error: cannot be handled */
 +      MF_FAILED,      /* Error: handling failed */
 +      MF_DELAYED,     /* Will be handled later */
 +      MF_RECOVERED,   /* Successfully recovered */
 +};
 +
 +enum mf_action_page_type {
 +      MF_MSG_KERNEL,
 +      MF_MSG_KERNEL_HIGH_ORDER,
 +      MF_MSG_SLAB,
 +      MF_MSG_DIFFERENT_COMPOUND,
 +      MF_MSG_POISONED_HUGE,
 +      MF_MSG_HUGE,
 +      MF_MSG_FREE_HUGE,
 +      MF_MSG_UNMAP_FAILED,
 +      MF_MSG_DIRTY_SWAPCACHE,
 +      MF_MSG_CLEAN_SWAPCACHE,
 +      MF_MSG_DIRTY_MLOCKED_LRU,
 +      MF_MSG_CLEAN_MLOCKED_LRU,
 +      MF_MSG_DIRTY_UNEVICTABLE_LRU,
 +      MF_MSG_CLEAN_UNEVICTABLE_LRU,
 +      MF_MSG_DIRTY_LRU,
 +      MF_MSG_CLEAN_LRU,
 +      MF_MSG_TRUNCATED_LRU,
 +      MF_MSG_BUDDY,
 +      MF_MSG_BUDDY_2ND,
 +      MF_MSG_UNKNOWN,
 +};
 +
  #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS)
  extern void clear_huge_page(struct page *page,
                            unsigned long addr,
diff --combined include/trace/events/writeback.h
index c178d13d6f4c0cb51d441c59e7b4975a1913ed3e,bec69995968f2f0d4a0025f386e66c6a4699ec47..a7aa607a4c55e51ec8ba8a6593828604e31a7aac
@@@ -250,6 -250,7 +250,6 @@@ DEFINE_EVENT(writeback_class, name, 
  DEFINE_WRITEBACK_EVENT(writeback_nowork);
  DEFINE_WRITEBACK_EVENT(writeback_wake_background);
  DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 -DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
  
  DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@@ -360,7 -361,7 +360,7 @@@ TRACE_EVENT(global_dirty_state
                __entry->nr_written     = global_page_state(NR_WRITTEN);
                __entry->background_thresh = background_thresh;
                __entry->dirty_thresh   = dirty_thresh;
-               __entry->dirty_limit = global_dirty_limit;
+               __entry->dirty_limit    = global_wb_domain.dirty_limit;
        ),
  
        TP_printk("dirty=%lu writeback=%lu unstable=%lu "
@@@ -399,13 -400,13 +399,13 @@@ TRACE_EVENT(bdi_dirty_ratelimit
  
        TP_fast_assign(
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-               __entry->write_bw       = KBps(bdi->write_bandwidth);
-               __entry->avg_write_bw   = KBps(bdi->avg_write_bandwidth);
+               __entry->write_bw       = KBps(bdi->wb.write_bandwidth);
+               __entry->avg_write_bw   = KBps(bdi->wb.avg_write_bandwidth);
                __entry->dirty_rate     = KBps(dirty_rate);
-               __entry->dirty_ratelimit = KBps(bdi->dirty_ratelimit);
+               __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
                __entry->task_ratelimit = KBps(task_ratelimit);
                __entry->balanced_dirty_ratelimit =
-                                         KBps(bdi->balanced_dirty_ratelimit);
+                                       KBps(bdi->wb.balanced_dirty_ratelimit);
        ),
  
        TP_printk("bdi %s: "
@@@ -462,8 -463,9 +462,9 @@@ TRACE_EVENT(balance_dirty_pages
                unsigned long freerun = (thresh + bg_thresh) / 2;
                strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
  
-               __entry->limit          = global_dirty_limit;
-               __entry->setpoint       = (global_dirty_limit + freerun) / 2;
+               __entry->limit          = global_wb_domain.dirty_limit;
+               __entry->setpoint       = (global_wb_domain.dirty_limit +
+                                               freerun) / 2;
                __entry->dirty          = dirty;
                __entry->bdi_setpoint   = __entry->setpoint *
                                                bdi_thresh / (thresh + 1);
diff --combined init/Kconfig
index b999fa381bf9fe1f37757af5e0a454cc6adb2da9,d4f763332f9f4a2c2417709194ea60026b37b4db..7260b27ebbabeb4537bc747cc604858e070baf00
@@@ -465,9 -465,13 +465,9 @@@ endmenu # "CPU/Task time and stats acco
  
  menu "RCU Subsystem"
  
 -choice
 -      prompt "RCU Implementation"
 -      default TREE_RCU
 -
  config TREE_RCU
 -      bool "Tree-based hierarchical RCU"
 -      depends on !PREEMPT && SMP
 +      bool
 +      default y if !PREEMPT && SMP
        help
          This option selects the RCU implementation that is
          designed for very large SMP system with hundreds or
          smaller systems.
  
  config PREEMPT_RCU
 -      bool "Preemptible tree-based hierarchical RCU"
 -      depends on PREEMPT
 +      bool
 +      default y if PREEMPT
        help
          This option selects the RCU implementation that is
          designed for very large SMP systems with hundreds or
          Select this option if you are unsure.
  
  config TINY_RCU
 -      bool "UP-only small-memory-footprint RCU"
 -      depends on !PREEMPT && !SMP
 +      bool
 +      default y if !PREEMPT && !SMP
        help
          This option selects the RCU implementation that is
          designed for UP systems from which real-time response
          is not required.  This option greatly reduces the
          memory footprint of RCU.
  
 -endchoice
 +config RCU_EXPERT
 +      bool "Make expert-level adjustments to RCU configuration"
 +      default n
 +      help
 +        This option needs to be enabled if you wish to make
 +        expert-level adjustments to RCU configuration.  By default,
 +        no such adjustments can be made, which has the often-beneficial
 +        side-effect of preventing "make oldconfig" from asking you all
 +        sorts of detailed questions about how you would like numerous
 +        obscure RCU options to be set up.
 +
 +        Say Y if you need to make expert-level adjustments to RCU.
 +
 +        Say N if you are unsure.
  
  config SRCU
        bool
          sections.
  
  config TASKS_RCU
 -      bool "Task_based RCU implementation using voluntary context switch"
 +      bool
        default n
        select SRCU
        help
          only voluntary context switch (not preemption!), idle, and
          user-mode execution as quiescent states.
  
 -        If unsure, say N.
 -
  config RCU_STALL_COMMON
        def_bool ( TREE_RCU || PREEMPT_RCU || RCU_TRACE )
        help
@@@ -538,7 -531,9 +538,7 @@@ config CONTEXT_TRACKIN
         bool
  
  config RCU_USER_QS
 -      bool "Consider userspace as in RCU extended quiescent state"
 -      depends on HAVE_CONTEXT_TRACKING && SMP
 -      select CONTEXT_TRACKING
 +      bool
        help
          This option sets hooks on kernel / userspace boundaries and
          puts RCU in extended quiescent state when the CPU runs in
          excluded from the global RCU state machine and thus doesn't
          try to keep the timer tick on for RCU.
  
 -        Unless you want to hack and help the development of the full
 -        dynticks mode, you shouldn't enable this option.  It also
 -        adds unnecessary overhead.
 -
 -        If unsure say N
 -
  config CONTEXT_TRACKING_FORCE
        bool "Force context tracking"
        depends on CONTEXT_TRACKING
@@@ -577,7 -578,7 +577,7 @@@ config RCU_FANOU
        int "Tree-based hierarchical RCU fanout value"
        range 2 64 if 64BIT
        range 2 32 if !64BIT
 -      depends on TREE_RCU || PREEMPT_RCU
 +      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
        default 64 if 64BIT
        default 32 if !64BIT
        help
  
  config RCU_FANOUT_LEAF
        int "Tree-based hierarchical RCU leaf-level fanout value"
 -      range 2 RCU_FANOUT if 64BIT
 -      range 2 RCU_FANOUT if !64BIT
 -      depends on TREE_RCU || PREEMPT_RCU
 +      range 2 64 if 64BIT
 +      range 2 32 if !64BIT
 +      depends on (TREE_RCU || PREEMPT_RCU) && RCU_EXPERT
        default 16
        help
          This option controls the leaf-level fanout of hierarchical
  
          Take the default if unsure.
  
 -config RCU_FANOUT_EXACT
 -      bool "Disable tree-based hierarchical RCU auto-balancing"
 -      depends on TREE_RCU || PREEMPT_RCU
 -      default n
 -      help
 -        This option forces use of the exact RCU_FANOUT value specified,
 -        regardless of imbalances in the hierarchy.  This is useful for
 -        testing RCU itself, and might one day be useful on systems with
 -        strong NUMA behavior.
 -
 -        Without RCU_FANOUT_EXACT, the code will balance the hierarchy.
 -
 -        Say N if unsure.
 -
  config RCU_FAST_NO_HZ
        bool "Accelerate last non-dyntick-idle CPU's grace periods"
 -      depends on NO_HZ_COMMON && SMP
 +      depends on NO_HZ_COMMON && SMP && RCU_EXPERT
        default n
        help
          This option permits CPUs to enter dynticks-idle state even if
@@@ -648,7 -663,7 +648,7 @@@ config TREE_RCU_TRAC
  
  config RCU_BOOST
        bool "Enable RCU priority boosting"
 -      depends on RT_MUTEXES && PREEMPT_RCU
 +      depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
        default n
        help
          This option boosts the priority of preempted RCU readers that
@@@ -665,7 -680,6 +665,7 @@@ config RCU_KTHREAD_PRI
        range 0 99 if !RCU_BOOST
        default 1 if RCU_BOOST
        default 0 if !RCU_BOOST
 +      depends on RCU_EXPERT
        help
          This option specifies the SCHED_FIFO priority value that will be
          assigned to the rcuc/n and rcub/n threads and is also the value
@@@ -1127,6 -1141,11 +1127,11 @@@ config DEBUG_BLK_CGROU
        Enable some debugging help. Currently it exports additional stat
        files in a cgroup which can be useful for debugging.
  
+ config CGROUP_WRITEBACK
+       bool
+       depends on MEMCG && BLK_CGROUP
+       default y
  endif # CGROUPS
  
  config CHECKPOINT_RESTORE
@@@ -1623,7 -1642,7 +1628,7 @@@ config PERF_EVENT
  config DEBUG_PERF_USE_VMALLOC
        default n
        bool "Debug: use vmalloc to back perf mmap() buffers"
 -      depends on PERF_EVENTS && DEBUG_KERNEL
 +      depends on PERF_EVENTS && DEBUG_KERNEL && !PPC
        select PERF_USE_VMALLOC
        help
         Use vmalloc memory to back perf mmap() buffers.
diff --combined mm/backing-dev.c
index 000e7b3b9896f2a9479687befd2442c43193614e,436bb53dd383f24380b11a44d6f3e39c42c39fbb..7756da31b02bcbb2a7f7036a4bbbdd093883ad6c
@@@ -18,6 -18,7 +18,7 @@@ struct backing_dev_info noop_backing_de
        .name           = "noop",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
  };
+ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
  
  static struct class *bdi_class;
  
@@@ -48,7 -49,7 +49,7 @@@ static int bdi_debug_stats_show(struct 
        struct bdi_writeback *wb = &bdi->wb;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
-       unsigned long bdi_thresh;
+       unsigned long wb_thresh;
        unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
        struct inode *inode;
  
@@@ -66,7 -67,7 +67,7 @@@
        spin_unlock(&wb->list_lock);
  
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+       wb_thresh = wb_calc_thresh(wb, dirty_thresh);
  
  #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
                   "b_dirty_time:       %10lu\n"
                   "bdi_list:           %10u\n"
                   "state:              %10lx\n",
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-                  K(bdi_thresh),
+                  (unsigned long) K(wb_stat(wb, WB_WRITEBACK)),
+                  (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)),
+                  K(wb_thresh),
                   K(dirty_thresh),
                   K(background_thresh),
-                  (unsigned long) K(bdi_stat(bdi, BDI_DIRTIED)),
-                  (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
-                  (unsigned long) K(bdi->write_bandwidth),
+                  (unsigned long) K(wb_stat(wb, WB_DIRTIED)),
+                  (unsigned long) K(wb_stat(wb, WB_WRITTEN)),
+                  (unsigned long) K(wb->write_bandwidth),
                   nr_dirty,
                   nr_io,
                   nr_more_io,
                   nr_dirty_time,
-                  !list_empty(&bdi->bdi_list), bdi->state);
+                  !list_empty(&bdi->bdi_list), bdi->wb.state);
  #undef K
  
        return 0;
@@@ -255,13 -256,8 +256,8 @@@ static int __init default_bdi_init(void
  }
  subsys_initcall(default_bdi_init);
  
- int bdi_has_dirty_io(struct backing_dev_info *bdi)
- {
-       return wb_has_dirty_io(&bdi->wb);
- }
  /*
-  * This function is used when the first inode for this bdi is marked dirty. It
+  * This function is used when the first inode for this wb is marked dirty. It
   * wakes up the corresponding bdi thread which should then take care of the
   * periodic background write-out of dirty inodes. Since the write-out would
   * start only 'dirty_writeback_interval' centisecs from now anyway, we just
   * We have to be careful not to postpone flush work if it is scheduled for
   * earlier. Thus we use queue_delayed_work().
   */
- void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
+ void wb_wakeup_delayed(struct bdi_writeback *wb)
  {
        unsigned long timeout;
  
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       spin_lock_bh(&bdi->wb_lock);
-       if (test_bit(BDI_registered, &bdi->state))
-               queue_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (test_bit(WB_registered, &wb->state))
+               queue_delayed_work(bdi_wq, &wb->dwork, timeout);
+       spin_unlock_bh(&wb->work_lock);
  }
  
  /*
-  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  * Initial write bandwidth: 100 MB/s
   */
- static void bdi_remove_from_list(struct backing_dev_info *bdi)
- {
-       spin_lock_bh(&bdi_lock);
-       list_del_rcu(&bdi->bdi_list);
-       spin_unlock_bh(&bdi_lock);
-       synchronize_rcu_expedited();
- }
+ #define INIT_BW               (100 << (20 - PAGE_SHIFT))
  
- int bdi_register(struct backing_dev_info *bdi, struct device *parent,
-               const char *fmt, ...)
+ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
+                  gfp_t gfp)
  {
-       va_list args;
-       struct device *dev;
+       int i, err;
  
-       if (bdi->dev)   /* The driver needs to use separate queues per device */
-               return 0;
+       memset(wb, 0, sizeof(*wb));
  
-       va_start(args, fmt);
-       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
-       va_end(args);
-       if (IS_ERR(dev))
-               return PTR_ERR(dev);
+       wb->bdi = bdi;
+       wb->last_old_flush = jiffies;
+       INIT_LIST_HEAD(&wb->b_dirty);
+       INIT_LIST_HEAD(&wb->b_io);
+       INIT_LIST_HEAD(&wb->b_more_io);
+       INIT_LIST_HEAD(&wb->b_dirty_time);
+       spin_lock_init(&wb->list_lock);
  
-       bdi->dev = dev;
+       wb->bw_time_stamp = jiffies;
+       wb->balanced_dirty_ratelimit = INIT_BW;
+       wb->dirty_ratelimit = INIT_BW;
+       wb->write_bandwidth = INIT_BW;
+       wb->avg_write_bandwidth = INIT_BW;
  
-       bdi_debug_register(bdi, dev_name(dev));
-       set_bit(BDI_registered, &bdi->state);
+       spin_lock_init(&wb->work_lock);
+       INIT_LIST_HEAD(&wb->work_list);
+       INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
  
-       spin_lock_bh(&bdi_lock);
-       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
-       spin_unlock_bh(&bdi_lock);
+       err = fprop_local_init_percpu(&wb->completions, gfp);
+       if (err)
+               return err;
  
-       trace_writeback_bdi_register(bdi);
-       return 0;
- }
- EXPORT_SYMBOL(bdi_register);
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
+               err = percpu_counter_init(&wb->stat[i], 0, gfp);
+               if (err) {
+                       while (i--)
+                               percpu_counter_destroy(&wb->stat[i]);
+                       fprop_local_destroy_percpu(&wb->completions);
+                       return err;
+               }
+       }
  
- int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
- {
-       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+       return 0;
  }
- EXPORT_SYMBOL(bdi_register_dev);
  
  /*
   * Remove bdi from the global list and shutdown any threads we have running
   */
- static void bdi_wb_shutdown(struct backing_dev_info *bdi)
+ static void wb_shutdown(struct bdi_writeback *wb)
  {
        /* Make sure nobody queues further work */
-       spin_lock_bh(&bdi->wb_lock);
-       if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
-               spin_unlock_bh(&bdi->wb_lock);
+       spin_lock_bh(&wb->work_lock);
+       if (!test_and_clear_bit(WB_registered, &wb->state)) {
+               spin_unlock_bh(&wb->work_lock);
                return;
        }
-       spin_unlock_bh(&bdi->wb_lock);
+       spin_unlock_bh(&wb->work_lock);
  
        /*
-        * Make sure nobody finds us on the bdi_list anymore
+        * Drain work list and shutdown the delayed_work.  !WB_registered
+        * tells wb_workfn() that @wb is dying and its work_list needs to
+        * be drained no matter what.
         */
-       bdi_remove_from_list(bdi);
+       mod_delayed_work(bdi_wq, &wb->dwork, 0);
+       flush_delayed_work(&wb->dwork);
+       WARN_ON(!list_empty(&wb->work_list));
+ }
+ static void wb_exit(struct bdi_writeback *wb)
+ {
+       int i;
+       WARN_ON(delayed_work_pending(&wb->dwork));
+       for (i = 0; i < NR_WB_STAT_ITEMS; i++)
+               percpu_counter_destroy(&wb->stat[i]);
+       fprop_local_destroy_percpu(&wb->completions);
+ }
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ #include <linux/memcontrol.h>
+ /*
+  * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
+  * blkcg->cgwb_list, and memcg->cgwb_list.  bdi->cgwb_tree is also RCU
+  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
+  * releases from bdi destruction path.
+  */
+ static DEFINE_SPINLOCK(cgwb_lock);
+ static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
+ /**
+  * wb_congested_get_create - get or create a wb_congested
+  * @bdi: associated bdi
+  * @blkcg_id: ID of the associated blkcg
+  * @gfp: allocation mask
+  *
+  * Look up the wb_congested for @blkcg_id on @bdi.  If missing, create one.
+  * The returned wb_congested has its reference count incremented.  Returns
+  * NULL on failure.
+  */
+ struct bdi_writeback_congested *
+ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
+ {
+       struct bdi_writeback_congested *new_congested = NULL, *congested;
+       struct rb_node **node, *parent;
+       unsigned long flags;
+       if (blkcg_id == 1)
+               return &bdi->wb_congested;
+ retry:
+       spin_lock_irqsave(&cgwb_lock, flags);
+       node = &bdi->cgwb_congested_tree.rb_node;
+       parent = NULL;
+       while (*node != NULL) {
+               parent = *node;
+               congested = container_of(parent, struct bdi_writeback_congested,
+                                        rb_node);
+               if (congested->blkcg_id < blkcg_id)
+                       node = &parent->rb_left;
+               else if (congested->blkcg_id > blkcg_id)
+                       node = &parent->rb_right;
+               else
+                       goto found;
+       }
+       if (new_congested) {
+               /* !found and storage for new one already allocated, insert */
+               congested = new_congested;
+               new_congested = NULL;
+               rb_link_node(&congested->rb_node, parent, node);
+               rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
+               atomic_inc(&bdi->usage_cnt);
+               goto found;
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       /* allocate storage for new one and retry */
+       new_congested = kzalloc(sizeof(*new_congested), gfp);
+       if (!new_congested)
+               return NULL;
+       atomic_set(&new_congested->refcnt, 0);
+       new_congested->bdi = bdi;
+       new_congested->blkcg_id = blkcg_id;
+       goto retry;
+ found:
+       atomic_inc(&congested->refcnt);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(new_congested);
+       return congested;
+ }
+ /**
+  * wb_congested_put - put a wb_congested
+  * @congested: wb_congested to put
+  *
+  * Put @congested and destroy it if the refcnt reaches zero.
+  */
+ void wb_congested_put(struct bdi_writeback_congested *congested)
+ {
+       struct backing_dev_info *bdi = congested->bdi;
+       unsigned long flags;
+       if (congested->blkcg_id == 1)
+               return;
+       local_irq_save(flags);
+       if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) {
+               local_irq_restore(flags);
+               return;
+       }
+       rb_erase(&congested->rb_node, &congested->bdi->cgwb_congested_tree);
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       kfree(congested);
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+ }
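A minimal sketch of the intended get/put pairing (essentially what cgwb_create() below does); the function name is hypothetical:

static int myp_track_congestion(struct backing_dev_info *bdi, int blkcg_id)
{
	struct bdi_writeback_congested *congested;

	congested = wb_congested_get_create(bdi, blkcg_id, GFP_KERNEL);
	if (!congested)
		return -ENOMEM;

	/* ... point a wb (or other user) at @congested here ... */

	wb_congested_put(congested);	/* drop the reference when done */
	return 0;
}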
+ static void cgwb_release_workfn(struct work_struct *work)
+ {
+       struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+                                               release_work);
+       struct backing_dev_info *bdi = wb->bdi;
+       wb_shutdown(wb);
+       css_put(wb->memcg_css);
+       css_put(wb->blkcg_css);
+       wb_congested_put(wb->congested);
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+       percpu_ref_exit(&wb->refcnt);
+       wb_exit(wb);
+       kfree_rcu(wb, rcu);
+       if (atomic_dec_and_test(&bdi->usage_cnt))
+               wake_up_all(&cgwb_release_wait);
+ }
+ static void cgwb_release(struct percpu_ref *refcnt)
+ {
+       struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback,
+                                               refcnt);
+       schedule_work(&wb->release_work);
+ }
+ static void cgwb_kill(struct bdi_writeback *wb)
+ {
+       lockdep_assert_held(&cgwb_lock);
+       WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id));
+       list_del(&wb->memcg_node);
+       list_del(&wb->blkcg_node);
+       percpu_ref_kill(&wb->refcnt);
+ }
+ static int cgwb_create(struct backing_dev_info *bdi,
+                      struct cgroup_subsys_state *memcg_css, gfp_t gfp)
+ {
+       struct mem_cgroup *memcg;
+       struct cgroup_subsys_state *blkcg_css;
+       struct blkcg *blkcg;
+       struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
+       struct bdi_writeback *wb;
+       unsigned long flags;
+       int ret = 0;
+       memcg = mem_cgroup_from_css(memcg_css);
+       blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys);
+       blkcg = css_to_blkcg(blkcg_css);
+       memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       blkcg_cgwb_list = &blkcg->cgwb_list;
+       /* look up again under lock and discard on blkcg mismatch */
+       spin_lock_irqsave(&cgwb_lock, flags);
+       wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+       if (wb && wb->blkcg_css != blkcg_css) {
+               cgwb_kill(wb);
+               wb = NULL;
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (wb)
+               goto out_put;
+       /* need to create a new one */
+       wb = kmalloc(sizeof(*wb), gfp);
+       if (!wb)
+               return -ENOMEM;
+       ret = wb_init(wb, bdi, gfp);
+       if (ret)
+               goto err_free;
+       ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp);
+       if (ret)
+               goto err_wb_exit;
+       ret = fprop_local_init_percpu(&wb->memcg_completions, gfp);
+       if (ret)
+               goto err_ref_exit;
+       wb->congested = wb_congested_get_create(bdi, blkcg_css->id, gfp);
+       if (!wb->congested) {
+               ret = -ENOMEM;
+               goto err_fprop_exit;
+       }
+       wb->memcg_css = memcg_css;
+       wb->blkcg_css = blkcg_css;
+       INIT_WORK(&wb->release_work, cgwb_release_workfn);
+       set_bit(WB_registered, &wb->state);
  
        /*
-        * Drain work list and shutdown the delayed_work.  At this point,
-        * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
-        * is dying and its work_list needs to be drained no matter what.
+        * The root wb determines the registered state of the whole bdi and
+        * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate
+        * whether they're still online.  Don't link @wb if any is dead.
+        * See wb_memcg_offline() and wb_blkcg_offline().
         */
-       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
-       flush_delayed_work(&bdi->wb.dwork);
+       ret = -ENODEV;
+       spin_lock_irqsave(&cgwb_lock, flags);
+       if (test_bit(WB_registered, &bdi->wb.state) &&
+           blkcg_cgwb_list->next && memcg_cgwb_list->next) {
+               /* we might have raced another instance of this function */
+               ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
+               if (!ret) {
+                       atomic_inc(&bdi->usage_cnt);
+                       list_add(&wb->memcg_node, memcg_cgwb_list);
+                       list_add(&wb->blkcg_node, blkcg_cgwb_list);
+                       css_get(memcg_css);
+                       css_get(blkcg_css);
+               }
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (ret) {
+               if (ret == -EEXIST)
+                       ret = 0;
+               goto err_put_congested;
+       }
+       goto out_put;
+ err_put_congested:
+       wb_congested_put(wb->congested);
+ err_fprop_exit:
+       fprop_local_destroy_percpu(&wb->memcg_completions);
+ err_ref_exit:
+       percpu_ref_exit(&wb->refcnt);
+ err_wb_exit:
+       wb_exit(wb);
+ err_free:
+       kfree(wb);
+ out_put:
+       css_put(blkcg_css);
+       return ret;
  }
  
- static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
+ /**
+  * wb_get_create - get wb for a given memcg, create if necessary
+  * @bdi: target bdi
+  * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+  * @gfp: allocation mask to use
+  *
+  * Try to get the wb for @memcg_css on @bdi.  If it doesn't exist, try to
+  * create one.  The returned wb has its refcount incremented.
+  *
+  * This function uses css_get() on @memcg_css and thus expects its refcnt
+  * to be positive on invocation.  IOW, rcu_read_lock() protection on
+  * @memcg_css isn't enough.  try_get it before calling this function.
+  *
+  * A wb is keyed by its associated memcg.  As blkcg implicitly enables
+  * memcg on the default hierarchy, memcg association is guaranteed to be
+  * more specific (equal or descendant to the associated blkcg) and thus can
+  * identify both the memcg and blkcg associations.
+  *
+  * Because the blkcg associated with a memcg may change as blkcg is enabled
+  * and disabled closer to root in the hierarchy, each wb keeps track of
+  * both the memcg and blkcg associated with it and verifies the blkcg on
+  * each lookup.  On mismatch, the existing wb is discarded and a new one is
+  * created.
+  */
+ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
+                                   struct cgroup_subsys_state *memcg_css,
+                                   gfp_t gfp)
  {
-       memset(wb, 0, sizeof(*wb));
+       struct bdi_writeback *wb;
+       might_sleep_if(gfp & __GFP_WAIT);
+       if (!memcg_css->parent)
+               return &bdi->wb;
+       do {
+               rcu_read_lock();
+               wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+               if (wb) {
+                       struct cgroup_subsys_state *blkcg_css;
+                       /* see whether the blkcg association has changed */
+                       blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
+                                                    &blkio_cgrp_subsys);
+                       if (unlikely(wb->blkcg_css != blkcg_css ||
+                                    !wb_tryget(wb)))
+                               wb = NULL;
+                       css_put(blkcg_css);
+               }
+               rcu_read_unlock();
+       } while (!wb && !cgwb_create(bdi, memcg_css, gfp));
+       return wb;
+ }
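
   The lookup/verify/create loop in wb_get_create() and cgwb_create() above follows a
   common shape: look up optimistically, allocate outside the lock, publish under the
   lock, and treat losing the insertion race as success by going back to the lookup.
   A minimal standalone model of just that shape (plain userspace C with invented
   names; not kernel code and not part of this series):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static int *slot;                       /* stands in for the radix tree entry */

    static int *get_or_create(int value)
    {
            int *obj;

            for (;;) {
                    pthread_mutex_lock(&lock);
                    obj = slot;             /* the radix_tree_lookup() step */
                    pthread_mutex_unlock(&lock);
                    if (obj)
                            return obj;

                    obj = malloc(sizeof(*obj));     /* create outside the lock */
                    if (!obj)
                            return NULL;
                    *obj = value;

                    pthread_mutex_lock(&lock);
                    if (!slot) {
                            slot = obj;     /* won the race: publish */
                            pthread_mutex_unlock(&lock);
                            return obj;
                    }
                    pthread_mutex_unlock(&lock);
                    free(obj);              /* lost the race (-EEXIST): retry */
            }
    }

    int main(void)
    {
            printf("%d\n", *get_or_create(42));
            return 0;
    }
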
  
-       wb->bdi = bdi;
-       wb->last_old_flush = jiffies;
-       INIT_LIST_HEAD(&wb->b_dirty);
-       INIT_LIST_HEAD(&wb->b_io);
-       INIT_LIST_HEAD(&wb->b_more_io);
-       INIT_LIST_HEAD(&wb->b_dirty_time);
-       spin_lock_init(&wb->list_lock);
-       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
+ static void cgwb_bdi_init(struct backing_dev_info *bdi)
+ {
+       bdi->wb.memcg_css = mem_cgroup_root_css;
+       bdi->wb.blkcg_css = blkcg_root_css;
+       bdi->wb_congested.blkcg_id = 1;
+       INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+       bdi->cgwb_congested_tree = RB_ROOT;
+       atomic_set(&bdi->usage_cnt, 1);
  }
  
- /*
-  * Initial write bandwidth: 100 MB/s
+ static void cgwb_bdi_destroy(struct backing_dev_info *bdi)
+ {
+       struct radix_tree_iter iter;
+       void **slot;
+       WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+       spin_lock_irq(&cgwb_lock);
+       radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+               cgwb_kill(*slot);
+       spin_unlock_irq(&cgwb_lock);
+       /*
+        * All cgwb's and their congested states must be shutdown and
+        * released before returning.  Drain the usage counter to wait for
+        * all cgwb's and cgwb_congested's ever created on @bdi.
+        */
+       atomic_dec(&bdi->usage_cnt);
+       wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));
+ }
+ /**
+  * wb_memcg_offline - kill all wb's associated with a memcg being offlined
+  * @memcg: memcg being offlined
+  *
+  * Also prevents creation of any new wb's associated with @memcg.
   */
- #define INIT_BW               (100 << (20 - PAGE_SHIFT))
+ void wb_memcg_offline(struct mem_cgroup *memcg)
+ {
+       LIST_HEAD(to_destroy);
+       struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg);
+       struct bdi_writeback *wb, *next;
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node)
+               cgwb_kill(wb);
+       memcg_cgwb_list->next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+ }
+ /**
+  * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
+  * @blkcg: blkcg being offlined
+  *
+  * Also prevents creation of any new wb's associated with @blkcg.
+  */
+ void wb_blkcg_offline(struct blkcg *blkcg)
+ {
+       LIST_HEAD(to_destroy);
+       struct bdi_writeback *wb, *next;
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+               cgwb_kill(wb);
+       blkcg->cgwb_list.next = NULL;   /* prevent new wb's */
+       spin_unlock_irq(&cgwb_lock);
+ }
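
   wb_memcg_offline() and wb_blkcg_offline() block future wb creation by poisoning the
   per-cgroup list head (->next = NULL) under cgwb_lock, which cgwb_create() re-checks
   under the same lock before linking a new wb. A tiny standalone model of that
   handshake (userspace C with invented names, not kernel code; a boolean stands in
   for the poisoned ->next pointer):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static bool list_dead;
    static int nr_members;

    static int add_member(void)             /* the cgwb_create() side */
    {
            int ret = -1;

            pthread_mutex_lock(&lock);
            if (!list_dead) {               /* checked under the same lock */
                    nr_members++;
                    ret = 0;
            }
            pthread_mutex_unlock(&lock);
            return ret;
    }

    static void offline(void)               /* the wb_*_offline() side */
    {
            pthread_mutex_lock(&lock);
            /* existing members would be killed here ... */
            list_dead = true;               /* prevent new ones */
            pthread_mutex_unlock(&lock);
    }

    int main(void)
    {
            add_member();
            offline();
            printf("late add %s\n", add_member() ? "rejected" : "accepted");
            return 0;
    }
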
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  
  int bdi_init(struct backing_dev_info *bdi)
  {
-       int i, err;
+       int err;
  
        bdi->dev = NULL;
  
        bdi->min_ratio = 0;
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
-       spin_lock_init(&bdi->wb_lock);
        INIT_LIST_HEAD(&bdi->bdi_list);
-       INIT_LIST_HEAD(&bdi->work_list);
+       init_waitqueue_head(&bdi->wb_waitq);
  
-       bdi_wb_init(&bdi->wb, bdi);
+       err = wb_init(&bdi->wb, bdi, GFP_KERNEL);
+       if (err)
+               return err;
  
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-               err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
-               if (err)
-                       goto err;
-       }
+       bdi->wb_congested.state = 0;
+       bdi->wb.congested = &bdi->wb_congested;
  
-       bdi->dirty_exceeded = 0;
+       cgwb_bdi_init(bdi);
+       return 0;
+ }
+ EXPORT_SYMBOL(bdi_init);
  
-       bdi->bw_time_stamp = jiffies;
-       bdi->written_stamp = 0;
+ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
+               const char *fmt, ...)
+ {
+       va_list args;
+       struct device *dev;
  
-       bdi->balanced_dirty_ratelimit = INIT_BW;
-       bdi->dirty_ratelimit = INIT_BW;
-       bdi->write_bandwidth = INIT_BW;
-       bdi->avg_write_bandwidth = INIT_BW;
+       if (bdi->dev)   /* The driver needs to use separate queues per device */
+               return 0;
  
-       err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
+       va_start(args, fmt);
+       dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args);
+       va_end(args);
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
  
-       if (err) {
- err:
-               while (i--)
-                       percpu_counter_destroy(&bdi->bdi_stat[i]);
-       }
+       bdi->dev = dev;
  
-       return err;
+       bdi_debug_register(bdi, dev_name(dev));
+       set_bit(WB_registered, &bdi->wb.state);
+       spin_lock_bh(&bdi_lock);
+       list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+       spin_unlock_bh(&bdi_lock);
+       trace_writeback_bdi_register(bdi);
+       return 0;
  }
- EXPORT_SYMBOL(bdi_init);
+ EXPORT_SYMBOL(bdi_register);
  
- void bdi_destroy(struct backing_dev_info *bdi)
+ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev)
  {
-       int i;
+       return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev));
+ }
+ EXPORT_SYMBOL(bdi_register_dev);
+ /*
+  * Remove bdi from bdi_list, and ensure that it is no longer visible
+  */
+ static void bdi_remove_from_list(struct backing_dev_info *bdi)
+ {
+       spin_lock_bh(&bdi_lock);
+       list_del_rcu(&bdi->bdi_list);
+       spin_unlock_bh(&bdi_lock);
  
-       bdi_wb_shutdown(bdi);
-       bdi_set_min_ratio(bdi, 0);
+       synchronize_rcu_expedited();
+ }
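
   For context, the list_del_rcu()/synchronize_rcu_expedited() pairing above only has
   a job to do because other code walks bdi_list under rcu_read_lock(); once the grace
   period elapses, no such walker can still be dereferencing the unlinked bdi. A
   sketch of the reader side this ordering protects (illustrative only, not copied
   from any particular call site; bdi_list is the file-local list just unlinked from):

    struct backing_dev_info *bdi;

    rcu_read_lock();
    list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
            /* @bdi cannot be freed while this section runs */
    }
    rcu_read_unlock();
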
  
-       WARN_ON(!list_empty(&bdi->work_list));
-       WARN_ON(delayed_work_pending(&bdi->wb.dwork));
 -/*
 - * Called when the device behind @bdi has been removed or ejected.
 - *
 - * We can't really do much here except for reducing the dirty ratio at
 - * the moment.  In the future we should be able to set a flag so that
 - * the filesystem can handle errors at mark_inode_dirty time instead
 - * of only at writeback time.
 - */
 -void bdi_unregister(struct backing_dev_info *bdi)
 -{
 -      if (WARN_ON_ONCE(!bdi->dev))
 -              return;
 -
 -      bdi_set_min_ratio(bdi, 0);
 -}
 -EXPORT_SYMBOL(bdi_unregister);
 -
+ void bdi_destroy(struct backing_dev_info *bdi)
+ {
+       /* make sure nobody finds us on the bdi_list anymore */
+       bdi_remove_from_list(bdi);
+       wb_shutdown(&bdi->wb);
+       cgwb_bdi_destroy(bdi);
  
        if (bdi->dev) {
                bdi_debug_unregister(bdi);
                bdi->dev = NULL;
        }
  
-       for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
-               percpu_counter_destroy(&bdi->bdi_stat[i]);
-       fprop_local_destroy_percpu(&bdi->completions);
+       wb_exit(&bdi->wb);
  }
  EXPORT_SYMBOL(bdi_destroy);
  
@@@ -472,31 -871,31 +854,31 @@@ static wait_queue_head_t congestion_wqh
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
        };
- static atomic_t nr_bdi_congested[2];
+ static atomic_t nr_wb_congested[2];
  
- void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
+ void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
  {
-       enum bdi_state bit;
        wait_queue_head_t *wqh = &congestion_wqh[sync];
+       enum wb_state bit;
  
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (test_and_clear_bit(bit, &bdi->state))
-               atomic_dec(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (test_and_clear_bit(bit, &congested->state))
+               atomic_dec(&nr_wb_congested[sync]);
        smp_mb__after_atomic();
        if (waitqueue_active(wqh))
                wake_up(wqh);
  }
- EXPORT_SYMBOL(clear_bdi_congested);
+ EXPORT_SYMBOL(clear_wb_congested);
  
- void set_bdi_congested(struct backing_dev_info *bdi, int sync)
+ void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
  {
-       enum bdi_state bit;
+       enum wb_state bit;
  
-       bit = sync ? BDI_sync_congested : BDI_async_congested;
-       if (!test_and_set_bit(bit, &bdi->state))
-               atomic_inc(&nr_bdi_congested[sync]);
+       bit = sync ? WB_sync_congested : WB_async_congested;
+       if (!test_and_set_bit(bit, &congested->state))
+               atomic_inc(&nr_wb_congested[sync]);
  }
- EXPORT_SYMBOL(set_bdi_congested);
+ EXPORT_SYMBOL(set_wb_congested);
  
  /**
   * congestion_wait - wait for a backing_dev to become uncongested
@@@ -555,7 -954,7 +937,7 @@@ long wait_iff_congested(struct zone *zo
         * encountered in the current zone, yield if necessary instead
         * of sleeping on the congestion queue
         */
-       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+       if (atomic_read(&nr_wb_congested[sync]) == 0 ||
            !test_bit(ZONE_CONGESTED, &zone->flags)) {
                cond_resched();
  
diff --combined mm/filemap.c
index 8d17ceea8dbeb1f641687407f2a27ebbff480533,bfc1ab053b1224acd96a1851e265f6872d744881..11f10efd637c2d67e071c482951e2bb38a105d6b
   *    ->tree_lock             (page_remove_rmap->set_page_dirty)
   *    bdi.wb->list_lock               (page_remove_rmap->set_page_dirty)
   *    ->inode->i_lock         (page_remove_rmap->set_page_dirty)
+  *    ->memcg->move_lock      (page_remove_rmap->mem_cgroup_begin_page_stat)
   *    bdi.wb->list_lock               (zap_pte_range->set_page_dirty)
   *    ->inode->i_lock         (zap_pte_range->set_page_dirty)
   *    ->private_lock          (zap_pte_range->__set_page_dirty_buffers)
@@@ -174,9 -175,11 +175,11 @@@ static void page_cache_tree_delete(stru
  /*
   * Delete a page from the page cache and free it. Caller has to make
   * sure the page is locked and that nobody else uses it - or that usage
-  * is safe.  The caller must hold the mapping's tree_lock.
+  * is safe.  The caller must hold the mapping's tree_lock and
+  * mem_cgroup_begin_page_stat().
   */
- void __delete_from_page_cache(struct page *page, void *shadow)
+ void __delete_from_page_cache(struct page *page, void *shadow,
+                             struct mem_cgroup *memcg)
  {
        struct address_space *mapping = page->mapping;
  
        page->mapping = NULL;
        /* Leave page->index set: truncation lookup relies upon it */
  
 -      __dec_zone_page_state(page, NR_FILE_PAGES);
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!PageHuge(page))
 +              __dec_zone_page_state(page, NR_FILE_PAGES);
        if (PageSwapBacked(page))
                __dec_zone_page_state(page, NR_SHMEM);
        BUG_ON(page_mapped(page));
         * anyway will be cleared before returning page into buddy allocator.
         */
        if (WARN_ON_ONCE(PageDirty(page)))
-               account_page_cleaned(page, mapping);
+               account_page_cleaned(page, mapping, memcg,
+                                    inode_to_wb(mapping->host));
  }
  
  /**
  void delete_from_page_cache(struct page *page)
  {
        struct address_space *mapping = page->mapping;
+       struct mem_cgroup *memcg;
+       unsigned long flags;
        void (*freepage)(struct page *);
  
        BUG_ON(!PageLocked(page));
  
        freepage = mapping->a_ops->freepage;
-       spin_lock_irq(&mapping->tree_lock);
-       __delete_from_page_cache(page, NULL);
-       spin_unlock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
+       __delete_from_page_cache(page, NULL, memcg);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
  
        if (freepage)
                freepage(page);
@@@ -283,7 -291,9 +293,9 @@@ int __filemap_fdatawrite_range(struct a
        if (!mapping_cap_writeback_dirty(mapping))
                return 0;
  
+       wbc_attach_fdatawrite_inode(&wbc, mapping->host);
        ret = do_writepages(mapping, &wbc);
+       wbc_detach_inode(&wbc);
        return ret;
  }
  
@@@ -472,6 -482,8 +484,8 @@@ int replace_page_cache_page(struct pag
        if (!error) {
                struct address_space *mapping = old->mapping;
                void (*freepage)(struct page *);
+               struct mem_cgroup *memcg;
+               unsigned long flags;
  
                pgoff_t offset = old->index;
                freepage = mapping->a_ops->freepage;
                new->mapping = mapping;
                new->index = offset;
  
-               spin_lock_irq(&mapping->tree_lock);
-               __delete_from_page_cache(old, NULL);
+               memcg = mem_cgroup_begin_page_stat(old);
+               spin_lock_irqsave(&mapping->tree_lock, flags);
+               __delete_from_page_cache(old, NULL, memcg);
                error = radix_tree_insert(&mapping->page_tree, offset, new);
                BUG_ON(error);
                mapping->nrpages++;
 -              __inc_zone_page_state(new, NR_FILE_PAGES);
 +
 +              /*
 +               * hugetlb pages do not participate in page cache accounting.
 +               */
 +              if (!PageHuge(new))
 +                      __inc_zone_page_state(new, NR_FILE_PAGES);
                if (PageSwapBacked(new))
                        __inc_zone_page_state(new, NR_SHMEM);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                mem_cgroup_migrate(old, new, true);
                radix_tree_preload_end();
                if (freepage)
@@@ -582,10 -591,7 +598,10 @@@ static int __add_to_page_cache_locked(s
        radix_tree_preload_end();
        if (unlikely(error))
                goto err_insert;
 -      __inc_zone_page_state(page, NR_FILE_PAGES);
 +
 +      /* hugetlb pages do not participate in page cache accounting. */
 +      if (!huge)
 +              __inc_zone_page_state(page, NR_FILE_PAGES);
        spin_unlock_irq(&mapping->tree_lock);
        if (!huge)
                mem_cgroup_commit_charge(page, memcg, false);
@@@ -1664,8 -1670,8 +1680,8 @@@ no_cached_page
                        error = -ENOMEM;
                        goto out;
                }
 -              error = add_to_page_cache_lru(page, mapping,
 -                                              index, GFP_KERNEL);
 +              error = add_to_page_cache_lru(page, mapping, index,
 +                                      GFP_KERNEL & mapping_gfp_mask(mapping));
                if (error) {
                        page_cache_release(page);
                        if (error == -EEXIST) {
@@@ -1766,8 -1772,7 +1782,8 @@@ static int page_cache_read(struct file 
                if (!page)
                        return -ENOMEM;
  
 -              ret = add_to_page_cache_lru(page, mapping, offset, GFP_KERNEL);
 +              ret = add_to_page_cache_lru(page, mapping, offset,
 +                              GFP_KERNEL & mapping_gfp_mask(mapping));
                if (ret == 0)
                        ret = mapping->a_ops->readpage(file, page);
                else if (ret == -EEXIST)
diff --combined mm/memcontrol.c
index e65f7b0131d3598cb5ba0ce3497d47b43d676dea,f816d91c643b7ee59af809b72f846ddddced8924..acb93c554f6e8456dc9312734162317d1adea54d
@@@ -77,6 -77,7 +77,7 @@@ EXPORT_SYMBOL(memory_cgrp_subsys)
  
  #define MEM_CGROUP_RECLAIM_RETRIES    5
  static struct mem_cgroup *root_mem_cgroup __read_mostly;
+ struct cgroup_subsys_state *mem_cgroup_root_css __read_mostly;
  
  /* Whether the swap controller is active */
  #ifdef CONFIG_MEMCG_SWAP
@@@ -90,6 -91,7 +91,7 @@@ static const char * const mem_cgroup_st
        "rss",
        "rss_huge",
        "mapped_file",
+       "dirty",
        "writeback",
        "swap",
  };
@@@ -285,9 -287,9 +287,9 @@@ struct mem_cgroup 
         */
        bool use_hierarchy;
  
 +      /* protected by memcg_oom_lock */
        bool            oom_lock;
 -      atomic_t        under_oom;
 -      atomic_t        oom_wakeups;
 +      int             under_oom;
  
        int     swappiness;
        /* OOM-Killer disable */
         * percpu counter.
         */
        struct mem_cgroup_stat_cpu __percpu *stat;
-       /*
-        * used when a cpu is offlined or other synchronizations
-        * See mem_cgroup_read_stat().
-        */
-       struct mem_cgroup_stat_cpu nocpu_base;
        spinlock_t pcp_counter_lock;
  
  #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
        atomic_t        numainfo_updating;
  #endif
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head cgwb_list;
+       struct wb_domain cgwb_domain;
+ #endif
        /* List of events which userspace want to receive */
        struct list_head event_list;
        spinlock_t event_list_lock;
@@@ -596,6 -598,39 +598,39 @@@ struct cgroup_subsys_state *mem_cgroup_
        return &memcg->css;
  }
  
+ /**
+  * mem_cgroup_css_from_page - css of the memcg associated with a page
+  * @page: page of interest
+  *
+  * If memcg is bound to the default hierarchy, css of the memcg associated
+  * with @page is returned.  The returned css remains associated with @page
+  * until it is released.
+  *
+  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
+  * is returned.
+  *
+  * XXX: The above description of behavior on the default hierarchy isn't
+  * strictly true yet as replace_page_cache_page() can modify the
+  * association before @page is released even on the default hierarchy;
+  * however, the current and planned usages don't mix the two functions
+  * and replace_page_cache_page() will soon be updated to make the invariant
+  * actually true.
+  */
+ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
+ {
+       struct mem_cgroup *memcg;
+       rcu_read_lock();
+       memcg = page->mem_cgroup;
+       if (!memcg || !cgroup_on_dfl(memcg->css.cgroup))
+               memcg = root_mem_cgroup;
+       rcu_read_unlock();
+       return &memcg->css;
+ }
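
   A sketch of how a writeback-side caller might consume this helper (illustrative
   only; 'page' and 'inode' are assumed locals and GFP_ATOMIC is just one plausible
   choice): resolve the css of the page being written and key the wb lookup off it
   with wb_get_create() from mm/backing-dev.c earlier in this merge.

    struct cgroup_subsys_state *memcg_css = mem_cgroup_css_from_page(page);
    struct bdi_writeback *wb;

    wb = wb_get_create(inode_to_bdi(inode), memcg_css, GFP_ATOMIC);
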
  static struct mem_cgroup_per_zone *
  mem_cgroup_page_zoneinfo(struct mem_cgroup *memcg, struct page *page)
  {
@@@ -795,15 -830,8 +830,8 @@@ static long mem_cgroup_read_stat(struc
        long val = 0;
        int cpu;
  
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->count[idx], cpu);
- #ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.count[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
- #endif
-       put_online_cpus();
        return val;
  }
  
@@@ -813,15 -841,8 +841,8 @@@ static unsigned long mem_cgroup_read_ev
        unsigned long val = 0;
        int cpu;
  
-       get_online_cpus();
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                val += per_cpu(memcg->stat->events[idx], cpu);
- #ifdef CONFIG_HOTPLUG_CPU
-       spin_lock(&memcg->pcp_counter_lock);
-       val += memcg->nocpu_base.events[idx];
-       spin_unlock(&memcg->pcp_counter_lock);
- #endif
-       put_online_cpus();
        return val;
  }
  
@@@ -1530,16 -1551,14 +1551,16 @@@ static void mem_cgroup_out_of_memory(st
        unsigned int points = 0;
        struct task_struct *chosen = NULL;
  
 +      mutex_lock(&oom_lock);
 +
        /*
         * If current has a pending SIGKILL or is exiting, then automatically
         * select it.  The goal is to allow it to allocate so that it may
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
 -              mark_tsk_oom_victim(current);
 -              return;
 +              mark_oom_victim(current);
 +              goto unlock;
        }
  
        check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL, memcg);
                                mem_cgroup_iter_break(memcg, iter);
                                if (chosen)
                                        put_task_struct(chosen);
 -                              return;
 +                              goto unlock;
                        case OOM_SCAN_OK:
                                break;
                        };
                css_task_iter_end(&it);
        }
  
 -      if (!chosen)
 -              return;
 -      points = chosen_points * 1000 / totalpages;
 -      oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
 -                       NULL, "Memory cgroup out of memory");
 +      if (chosen) {
 +              points = chosen_points * 1000 / totalpages;
 +              oom_kill_process(chosen, gfp_mask, order, points, totalpages,
 +                               memcg, NULL, "Memory cgroup out of memory");
 +      }
 +unlock:
 +      mutex_unlock(&oom_lock);
  }
  
  #if MAX_NUMNODES > 1
@@@ -1810,10 -1827,8 +1831,10 @@@ static void mem_cgroup_mark_under_oom(s
  {
        struct mem_cgroup *iter;
  
 +      spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
 -              atomic_inc(&iter->under_oom);
 +              iter->under_oom++;
 +      spin_unlock(&memcg_oom_lock);
  }
  
  static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
  
        /*
         * When a new child is created while the hierarchy is under oom,
 -       * mem_cgroup_oom_lock() may not be called. We have to use
 -       * atomic_add_unless() here.
 +       * mem_cgroup_oom_lock() may not be called. Watch for underflow.
         */
 +      spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
 -              atomic_add_unless(&iter->under_oom, -1, 0);
 +              if (iter->under_oom > 0)
 +                      iter->under_oom--;
 +      spin_unlock(&memcg_oom_lock);
  }
  
  static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@@ -1854,18 -1867,17 +1875,18 @@@ static int memcg_oom_wake_function(wait
        return autoremove_wake_function(wait, mode, sync, arg);
  }
  
 -static void memcg_wakeup_oom(struct mem_cgroup *memcg)
 -{
 -      atomic_inc(&memcg->oom_wakeups);
 -      /* for filtering, pass "memcg" as argument. */
 -      __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
 -}
 -
  static void memcg_oom_recover(struct mem_cgroup *memcg)
  {
 -      if (memcg && atomic_read(&memcg->under_oom))
 -              memcg_wakeup_oom(memcg);
 +      /*
 +       * For the following lockless ->under_oom test, the only required
 +       * guarantee is that it must see the state asserted by an OOM when
 +       * this function is called as a result of userland actions
 +       * triggered by the notification of the OOM.  This is trivially
 +       * achieved by invoking mem_cgroup_mark_under_oom() before
 +       * triggering notification.
 +       */
 +      if (memcg && memcg->under_oom)
 +              __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
  }
  
  static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
@@@ -2020,6 -2032,7 +2041,7 @@@ again
  
        return memcg;
  }
+ EXPORT_SYMBOL(mem_cgroup_begin_page_stat);
  
  /**
   * mem_cgroup_end_page_stat - finish a page state statistics transaction
@@@ -2038,6 -2051,7 +2060,7 @@@ void mem_cgroup_end_page_stat(struct me
  
        rcu_read_unlock();
  }
+ EXPORT_SYMBOL(mem_cgroup_end_page_stat);
  
  /**
   * mem_cgroup_update_page_stat - update page state statistics
@@@ -2178,37 -2192,12 +2201,12 @@@ static void drain_all_stock(struct mem_
        mutex_unlock(&percpu_charge_mutex);
  }
  
- /*
-  * This function drains percpu counter value from DEAD cpu and
-  * move it to local cpu. Note that this function can be preempted.
-  */
- static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
- {
-       int i;
-       spin_lock(&memcg->pcp_counter_lock);
-       for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-               long x = per_cpu(memcg->stat->count[i], cpu);
-               per_cpu(memcg->stat->count[i], cpu) = 0;
-               memcg->nocpu_base.count[i] += x;
-       }
-       for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-               unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-               per_cpu(memcg->stat->events[i], cpu) = 0;
-               memcg->nocpu_base.events[i] += x;
-       }
-       spin_unlock(&memcg->pcp_counter_lock);
- }
  static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
                                        unsigned long action,
                                        void *hcpu)
  {
        int cpu = (unsigned long)hcpu;
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *iter;
  
        if (action == CPU_ONLINE)
                return NOTIFY_OK;
        if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
                return NOTIFY_OK;
  
-       for_each_mem_cgroup(iter)
-               mem_cgroup_drain_pcp_counter(iter, cpu);
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
        return NOTIFY_OK;
@@@ -2332,8 -2318,6 +2327,8 @@@ done_restock
        css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
 +      if (!(gfp_mask & __GFP_WAIT))
 +              goto done;
        /*
         * If the hierarchy is above the normal consumption range,
         * make the charging task trim their excess contribution.
@@@ -3873,7 -3857,7 +3868,7 @@@ static int mem_cgroup_oom_register_even
        list_add(&event->list, &memcg->oom_notify);
  
        /* already in OOM ? */
 -      if (atomic_read(&memcg->under_oom))
 +      if (memcg->under_oom)
                eventfd_signal(eventfd, 1);
        spin_unlock(&memcg_oom_lock);
  
@@@ -3902,7 -3886,7 +3897,7 @@@ static int mem_cgroup_oom_control_read(
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
  
        seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
 -      seq_printf(sf, "under_oom %d\n", (bool)atomic_read(&memcg->under_oom));
 +      seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
        return 0;
  }
  
@@@ -4004,6 -3988,98 +3999,98 @@@ static void memcg_destroy_kmem(struct m
  }
  #endif
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
+ {
+       return &memcg->cgwb_list;
+ }
+ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+ {
+       return wb_domain_init(&memcg->cgwb_domain, gfp);
+ }
+ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+ {
+       wb_domain_exit(&memcg->cgwb_domain);
+ }
+ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+ {
+       wb_domain_size_changed(&memcg->cgwb_domain);
+ }
+ struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       if (!memcg->css.parent)
+               return NULL;
+       return &memcg->cgwb_domain;
+ }
+ /**
+  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
+  * @wb: bdi_writeback in question
+  * @pavail: out parameter for number of available pages
+  * @pdirty: out parameter for number of dirty pages
+  * @pwriteback: out parameter for number of pages under writeback
+  *
+  * Determine the numbers of available, dirty, and writeback pages in @wb's
+  * memcg.  Dirty and writeback are self-explanatory.  Available is a bit
+  * more involved.
+  *
+  * A memcg's headroom is "min(max, high) - used".  The available memory is
+  * calculated as the lowest headroom of itself and the ancestors plus the
+  * number of pages already being used for file pages.  Note that this
+  * doesn't consider the actual amount of available memory in the system.
+  * The caller should further cap *@pavail accordingly.
+  */
+ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pavail,
+                        unsigned long *pdirty, unsigned long *pwriteback)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+       unsigned long head_room = PAGE_COUNTER_MAX;
+       unsigned long file_pages;
+       *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+       /* this should eventually include NR_UNSTABLE_NFS */
+       *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+       file_pages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
+                                                   (1 << LRU_ACTIVE_FILE));
+       while ((parent = parent_mem_cgroup(memcg))) {
+               unsigned long ceiling = min(memcg->memory.limit, memcg->high);
+               unsigned long used = page_counter_read(&memcg->memory);
+               head_room = min(head_room, ceiling - min(ceiling, used));
+               memcg = parent;
+       }
+       *pavail = file_pages + head_room;
+ }
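
   The headroom walk above is easier to see with numbers. A self-contained userspace
   toy (not kernel code; the limits, usage and file-page counts are invented) that
   mirrors the arithmetic:

    #include <stdio.h>

    struct cg { unsigned long limit, high, used; };

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
            /* leaf first, then its ancestors below the root */
            struct cg path[] = {
                    { .limit = 131072, .high = 65536,  .used = 40000 },  /* leaf   */
                    { .limit = 262144, .high = 262144, .used = 250000 }, /* parent */
            };
            unsigned long file_pages = 12000;   /* LRU file pages of the leaf */
            unsigned long head_room = ~0UL;     /* stands in for PAGE_COUNTER_MAX */
            unsigned int i;

            for (i = 0; i < sizeof(path) / sizeof(path[0]); i++) {
                    unsigned long ceiling = MIN(path[i].limit, path[i].high);
                    unsigned long used = path[i].used;

                    head_room = MIN(head_room, ceiling - MIN(ceiling, used));
            }
            /* leaf headroom 65536-40000=25536, parent 262144-250000=12144 -> 12144 */
            printf("avail = %lu pages\n", file_pages + head_room);  /* 24144 */
            return 0;
    }
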
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
+ {
+       return 0;
+ }
+ static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
+ {
+ }
+ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
+ {
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
  /*
   * DO NOT USE IN NEW FILES.
   *
@@@ -4388,9 -4464,15 +4475,15 @@@ static struct mem_cgroup *mem_cgroup_al
        memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
        if (!memcg->stat)
                goto out_free;
+       if (memcg_wb_domain_init(memcg, GFP_KERNEL))
+               goto out_free_stat;
        spin_lock_init(&memcg->pcp_counter_lock);
        return memcg;
  
+ out_free_stat:
+       free_percpu(memcg->stat);
  out_free:
        kfree(memcg);
        return NULL;
@@@ -4417,6 -4499,7 +4510,7 @@@ static void __mem_cgroup_free(struct me
                free_mem_cgroup_per_zone_info(memcg, node);
  
        free_percpu(memcg->stat);
+       memcg_wb_domain_exit(memcg);
        kfree(memcg);
  }
  
@@@ -4449,6 -4532,7 +4543,7 @@@ mem_cgroup_css_alloc(struct cgroup_subs
        /* root ? */
        if (parent_css == NULL) {
                root_mem_cgroup = memcg;
+               mem_cgroup_root_css = &memcg->css;
                page_counter_init(&memcg->memory, NULL);
                memcg->high = PAGE_COUNTER_MAX;
                memcg->soft_limit = PAGE_COUNTER_MAX;
  #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
  #endif
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&memcg->cgwb_list);
+ #endif
        return &memcg->css;
  
  free_out:
@@@ -4555,6 -4641,8 +4652,8 @@@ static void mem_cgroup_css_offline(stru
        vmpressure_cleanup(&memcg->vmpressure);
  
        memcg_deactivate_kmem(memcg);
+       wb_memcg_offline(memcg);
  }
  
  static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@@ -4588,6 -4676,7 +4687,7 @@@ static void mem_cgroup_css_reset(struc
        memcg->low = 0;
        memcg->high = PAGE_COUNTER_MAX;
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       memcg_wb_domain_size_changed(memcg);
  }
  
  #ifdef CONFIG_MMU
@@@ -4757,6 -4846,7 +4857,7 @@@ static int mem_cgroup_move_account(stru
  {
        unsigned long flags;
        int ret;
+       bool anon;
  
        VM_BUG_ON(from == to);
        VM_BUG_ON_PAGE(PageLRU(page), page);
        if (page->mem_cgroup != from)
                goto out_unlock;
  
+       anon = PageAnon(page);
        spin_lock_irqsave(&from->move_lock, flags);
  
-       if (!PageAnon(page) && page_mapped(page)) {
+       if (!anon && page_mapped(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
                __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
                               nr_pages);
        }
  
+       /*
+        * move_lock grabbed above and caller set from->moving_account, so
+        * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+        * So mapping should be stable for dirty pages.
+        */
+       if (!anon && PageDirty(page)) {
+               struct address_space *mapping = page_mapping(page);
+               if (mapping_cap_account_dirty(mapping)) {
+                       __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+                       __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+                                      nr_pages);
+               }
+       }
        if (PageWriteback(page)) {
                __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
                               nr_pages);
@@@ -5306,6 -5414,7 +5425,7 @@@ static ssize_t memory_high_write(struc
  
        memcg->high = high;
  
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
  }
  
@@@ -5338,6 -5447,7 +5458,7 @@@ static ssize_t memory_max_write(struct 
        if (err)
                return err;
  
+       memcg_wb_domain_size_changed(memcg);
        return nbytes;
  }
  
@@@ -5844,7 -5954,9 +5965,7 @@@ void mem_cgroup_swapout(struct page *pa
        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, 1);
  
 -      /* XXX: caller holds IRQ-safe mapping->tree_lock */
 -      VM_BUG_ON(!irqs_disabled());
 -
 +      /* Caller disabled preemption with mapping->tree_lock */
        mem_cgroup_charge_statistics(memcg, page, -1);
        memcg_check_events(memcg, page);
  }
diff --combined mm/page-writeback.c
index eb59f7eea50827fc09e1c4f7a432b59ff2241d17,e1514d5b4e9bf62b5ae3552fb6f35900776985d9..22cddd3e5de8433952e99438d3260ae9ff20bd8d
@@@ -122,31 -122,31 +122,31 @@@ EXPORT_SYMBOL(laptop_mode)
  
  /* End of sysctl-exported parameters */
  
- unsigned long global_dirty_limit;
+ struct wb_domain global_wb_domain;
  
- /*
-  * Scale the writeback cache size proportional to the relative writeout speeds.
-  *
-  * We do this by keeping a floating proportion between BDIs, based on page
-  * writeback completions [end_page_writeback()]. Those devices that write out
-  * pages fastest will get the larger share, while the slower will get a smaller
-  * share.
-  *
-  * We use page writeout completions because we are interested in getting rid of
-  * dirty pages. Having them written out is the primary goal.
-  *
-  * We introduce a concept of time, a period over which we measure these events,
-  * because demand can/will vary over time. The length of this period itself is
-  * measured in page writeback completions.
-  *
-  */
- static struct fprop_global writeout_completions;
+ /* consolidated parameters for balance_dirty_pages() and its subroutines */
+ struct dirty_throttle_control {
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       struct wb_domain        *dom;
+       struct dirty_throttle_control *gdtc;    /* only set in memcg dtc's */
+ #endif
+       struct bdi_writeback    *wb;
+       struct fprop_local_percpu *wb_completions;
  
- static void writeout_period(unsigned long t);
- /* Timer for aging of writeout_completions */
- static struct timer_list writeout_period_timer =
-               TIMER_DEFERRED_INITIALIZER(writeout_period, 0, 0);
- static unsigned long writeout_period_time = 0;
+       unsigned long           avail;          /* dirtyable */
+       unsigned long           dirty;          /* file_dirty + write + nfs */
+       unsigned long           thresh;         /* dirty threshold */
+       unsigned long           bg_thresh;      /* dirty background threshold */
+       unsigned long           wb_dirty;       /* per-wb counterparts */
+       unsigned long           wb_thresh;
+       unsigned long           wb_bg_thresh;
+       unsigned long           pos_ratio;
+ };
+ #define DTC_INIT_COMMON(__wb) .wb = (__wb),                           \
+                               .wb_completions = &(__wb)->completions
  
  /*
   * Length of period for aging writeout fractions of bdis. This is an
   */
  #define VM_COMPLETIONS_PERIOD_LEN (3*HZ)
  
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ #define GDTC_INIT(__wb)               .dom = &global_wb_domain,               \
+                               DTC_INIT_COMMON(__wb)
+ #define GDTC_INIT_NO_WB               .dom = &global_wb_domain
+ #define MDTC_INIT(__wb, __gdtc)       .dom = mem_cgroup_wb_domain(__wb),      \
+                               .gdtc = __gdtc,                         \
+                               DTC_INIT_COMMON(__wb)
+ static bool mdtc_valid(struct dirty_throttle_control *dtc)
+ {
+       return dtc->dom;
+ }
+ static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+ {
+       return dtc->dom;
+ }
+ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+ {
+       return mdtc->gdtc;
+ }
+ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+ {
+       return &wb->memcg_completions;
+ }
+ static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+ {
+       unsigned long this_bw = wb->avg_write_bandwidth;
+       unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+       unsigned long long min = wb->bdi->min_ratio;
+       unsigned long long max = wb->bdi->max_ratio;
+       /*
+        * @wb may already be clean by the time control reaches here and
+        * the total may not include its bw.
+        */
+       if (this_bw < tot_bw) {
+               if (min) {
+                       min *= this_bw;
+                       do_div(min, tot_bw);
+               }
+               if (max < 100) {
+                       max *= this_bw;
+                       do_div(max, tot_bw);
+               }
+       }
+       *minp = min;
+       *maxp = max;
+ }
+ #else /* CONFIG_CGROUP_WRITEBACK */
+ #define GDTC_INIT(__wb)               DTC_INIT_COMMON(__wb)
+ #define GDTC_INIT_NO_WB
+ #define MDTC_INIT(__wb, __gdtc)
+ static bool mdtc_valid(struct dirty_throttle_control *dtc)
+ {
+       return false;
+ }
+ static struct wb_domain *dtc_dom(struct dirty_throttle_control *dtc)
+ {
+       return &global_wb_domain;
+ }
+ static struct dirty_throttle_control *mdtc_gdtc(struct dirty_throttle_control *mdtc)
+ {
+       return NULL;
+ }
+ static struct fprop_local_percpu *wb_memcg_completions(struct bdi_writeback *wb)
+ {
+       return NULL;
+ }
+ static void wb_min_max_ratio(struct bdi_writeback *wb,
+                            unsigned long *minp, unsigned long *maxp)
+ {
+       *minp = wb->bdi->min_ratio;
+       *maxp = wb->bdi->max_ratio;
+ }
+ #endif        /* CONFIG_CGROUP_WRITEBACK */
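
   For orientation, GDTC_INIT/MDTC_INIT above are designed to be composed on the
   stack by the throttling paths, with the memcg dtc pointing back at the global one
   through ->gdtc. Roughly like the following kernel-style sketch (the *_stor names
   are illustrative and 'wb' is assumed to be the bdi_writeback being throttled):

    struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
    struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
    struct dirty_throttle_control *gdtc = &gdtc_stor;
    struct dirty_throttle_control *mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL;
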
  /*
   * In a memory zone, there is a certain amount of pages we consider
   * available for the page cache, which is essentially the number of
@@@ -250,42 -341,88 +341,88 @@@ static unsigned long global_dirtyable_m
        return x + 1;   /* Ensure that we never return 0 */
  }
  
- /*
-  * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ /**
+  * domain_dirty_limits - calculate thresh and bg_thresh for a wb_domain
+  * @dtc: dirty_throttle_control of interest
   *
-  * Calculate the dirty thresholds based on sysctl parameters
-  * - vm.dirty_background_ratio  or  vm.dirty_background_bytes
-  * - vm.dirty_ratio             or  vm.dirty_bytes
-  * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+  * Calculate @dtc->thresh and ->bg_thresh considering
+  * vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}.  The caller
+  * must ensure that @dtc->avail is set before calling this function.  The
+  * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
   * real-time tasks.
   */
- void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
  {
-       const unsigned long available_memory = global_dirtyable_memory();
-       unsigned long background;
-       unsigned long dirty;
+       const unsigned long available_memory = dtc->avail;
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc);
+       unsigned long bytes = vm_dirty_bytes;
+       unsigned long bg_bytes = dirty_background_bytes;
+       unsigned long ratio = vm_dirty_ratio;
+       unsigned long bg_ratio = dirty_background_ratio;
+       unsigned long thresh;
+       unsigned long bg_thresh;
        struct task_struct *tsk;
  
-       if (vm_dirty_bytes)
-               dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
+       /* gdtc is !NULL iff @dtc is for memcg domain */
+       if (gdtc) {
+               unsigned long global_avail = gdtc->avail;
+               /*
+                * The byte settings can't be applied directly to memcg
+                * domains.  Convert them to ratios by scaling against
+                * globally available memory.
+                */
+               if (bytes)
+                       ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 /
+                                   global_avail, 100UL);
+               if (bg_bytes)
+                       bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 /
+                                      global_avail, 100UL);
+               bytes = bg_bytes = 0;
+       }
+       if (bytes)
+               thresh = DIV_ROUND_UP(bytes, PAGE_SIZE);
        else
-               dirty = (vm_dirty_ratio * available_memory) / 100;
+               thresh = (ratio * available_memory) / 100;
  
-       if (dirty_background_bytes)
-               background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
+       if (bg_bytes)
+               bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE);
        else
-               background = (dirty_background_ratio * available_memory) / 100;
+               bg_thresh = (bg_ratio * available_memory) / 100;
  
-       if (background >= dirty)
-               background = dirty / 2;
+       if (bg_thresh >= thresh)
+               bg_thresh = thresh / 2;
        tsk = current;
        if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-               background += background / 4;
-               dirty += dirty / 4;
+               bg_thresh += bg_thresh / 4;
+               thresh += thresh / 4;
        }
-       *pbackground = background;
-       *pdirty = dirty;
-       trace_global_dirty_state(background, dirty);
+       dtc->thresh = thresh;
+       dtc->bg_thresh = bg_thresh;
+       /* we should eventually report the domain in the TP */
+       if (!gdtc)
+               trace_global_dirty_state(bg_thresh, thresh);
+ }
+ /**
+  * global_dirty_limits - background-writeback and dirty-throttling thresholds
+  * @pbackground: out parameter for bg_thresh
+  * @pdirty: out parameter for thresh
+  *
+  * Calculate bg_thresh and thresh for global_wb_domain.  See
+  * domain_dirty_limits() for details.
+  */
+ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
+ {
+       struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
+       gdtc.avail = global_dirtyable_memory();
+       domain_dirty_limits(&gdtc);
+       *pbackground = gdtc.bg_thresh;
+       *pdirty = gdtc.thresh;
  }
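
   A worked example of domain_dirty_limits() with round numbers, as a self-contained
   userspace toy (not kernel code; the dirtyable page count and sysctl values are
   invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long available_memory = 1000000;       /* dtc->avail */
            unsigned long ratio = 20, bg_ratio = 10;        /* sysctl percentages */
            unsigned long thresh, bg_thresh;

            thresh = (ratio * available_memory) / 100;          /* 200000 pages */
            bg_thresh = (bg_ratio * available_memory) / 100;    /* 100000 pages */
            if (bg_thresh >= thresh)
                    bg_thresh = thresh / 2;                     /* not taken here */

            /* a PF_LESS_THROTTLE or rt task would get a 25% boost:
             * thresh 250000, bg_thresh 125000 */
            printf("thresh=%lu bg_thresh=%lu\n", thresh, bg_thresh);
            return 0;
    }
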
  
  /**
@@@ -392,47 -529,52 +529,52 @@@ static unsigned long wp_next_time(unsig
        return cur_time;
  }
  
- /*
-  * Increment the BDI's writeout completion count and the global writeout
-  * completion count. Called from test_clear_page_writeback().
-  */
- static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
+ static void wb_domain_writeout_inc(struct wb_domain *dom,
+                                  struct fprop_local_percpu *completions,
+                                  unsigned int max_prop_frac)
  {
-       __inc_bdi_stat(bdi, BDI_WRITTEN);
-       __fprop_inc_percpu_max(&writeout_completions, &bdi->completions,
-                              bdi->max_prop_frac);
+       __fprop_inc_percpu_max(&dom->completions, completions,
+                              max_prop_frac);
        /* First event after period switching was turned off? */
-       if (!unlikely(writeout_period_time)) {
+       if (!unlikely(dom->period_time)) {
                /*
                 * We can race with other __bdi_writeout_inc calls here but
                 * it does not cause any harm since the resulting time when
                 * timer will fire and what is in writeout_period_time will be
                 * roughly the same.
                 */
-               writeout_period_time = wp_next_time(jiffies);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               dom->period_time = wp_next_time(jiffies);
+               mod_timer(&dom->period_timer, dom->period_time);
        }
  }
  
- void bdi_writeout_inc(struct backing_dev_info *bdi)
+ /*
+  * Increment @wb's writeout completion count and the global writeout
+  * completion count. Called from test_clear_page_writeback().
+  */
+ static inline void __wb_writeout_inc(struct bdi_writeback *wb)
  {
-       unsigned long flags;
+       struct wb_domain *cgdom;
  
-       local_irq_save(flags);
-       __bdi_writeout_inc(bdi);
-       local_irq_restore(flags);
+       __inc_wb_stat(wb, WB_WRITTEN);
+       wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
+                              wb->bdi->max_prop_frac);
+       cgdom = mem_cgroup_wb_domain(wb);
+       if (cgdom)
+               wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
+                                      wb->bdi->max_prop_frac);
  }
- EXPORT_SYMBOL_GPL(bdi_writeout_inc);
  
- /*
-  * Obtain an accurate fraction of the BDI's portion.
-  */
- static void bdi_writeout_fraction(struct backing_dev_info *bdi,
-               long *numerator, long *denominator)
+ void wb_writeout_inc(struct bdi_writeback *wb)
  {
-       fprop_fraction_percpu(&writeout_completions, &bdi->completions,
-                               numerator, denominator);
+       unsigned long flags;
+       local_irq_save(flags);
+       __wb_writeout_inc(wb);
+       local_irq_restore(flags);
  }
+ EXPORT_SYMBOL_GPL(wb_writeout_inc);
  
  /*
   * On idle system, we can be called long after we scheduled because we use
   */
  static void writeout_period(unsigned long t)
  {
-       int miss_periods = (jiffies - writeout_period_time) /
+       struct wb_domain *dom = (void *)t;
+       int miss_periods = (jiffies - dom->period_time) /
                                                 VM_COMPLETIONS_PERIOD_LEN;
  
-       if (fprop_new_period(&writeout_completions, miss_periods + 1)) {
-               writeout_period_time = wp_next_time(writeout_period_time +
+       if (fprop_new_period(&dom->completions, miss_periods + 1)) {
+               dom->period_time = wp_next_time(dom->period_time +
                                miss_periods * VM_COMPLETIONS_PERIOD_LEN);
-               mod_timer(&writeout_period_timer, writeout_period_time);
+               mod_timer(&dom->period_timer, dom->period_time);
        } else {
                /*
                 * Aging has zeroed all fractions. Stop wasting CPU on period
                 * updates.
                 */
-               writeout_period_time = 0;
+               dom->period_time = 0;
        }
  }
  
+ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
+ {
+       memset(dom, 0, sizeof(*dom));
+       spin_lock_init(&dom->lock);
+       init_timer_deferrable(&dom->period_timer);
+       dom->period_timer.function = writeout_period;
+       dom->period_timer.data = (unsigned long)dom;
+       dom->dirty_limit_tstamp = jiffies;
+       return fprop_global_init(&dom->completions, gfp);
+ }
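
   Both kinds of writeback domain in this series are expected to pass through this
   initializer: the per-memcg domain via memcg_wb_domain_init() shown earlier in this
   merge, and the global domain once during boot (the boot-time call site is not part
   of this hunk). A minimal sketch of the global side, assuming that caller treats
   failure as fatal:

    BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
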
+ #ifdef CONFIG_CGROUP_WRITEBACK
+ void wb_domain_exit(struct wb_domain *dom)
+ {
+       del_timer_sync(&dom->period_timer);
+       fprop_global_destroy(&dom->completions);
+ }
+ #endif
  /*
   * bdi_min_ratio keeps the sum of the minimum dirty shares of all
   * registered backing devices, which, for obvious reasons, can not
@@@ -510,17 -676,26 +676,26 @@@ static unsigned long dirty_freerun_ceil
        return (thresh + bg_thresh) / 2;
  }
  
- static unsigned long hard_dirty_limit(unsigned long thresh)
+ static unsigned long hard_dirty_limit(struct wb_domain *dom,
+                                     unsigned long thresh)
  {
-       return max(thresh, global_dirty_limit);
+       return max(thresh, dom->dirty_limit);
+ }
+ /* memory available to a memcg domain is capped by system-wide clean memory */
+ static void mdtc_cap_avail(struct dirty_throttle_control *mdtc)
+ {
+       struct dirty_throttle_control *gdtc = mdtc_gdtc(mdtc);
+       unsigned long clean = gdtc->avail - min(gdtc->avail, gdtc->dirty);
+       mdtc->avail = min(mdtc->avail, clean);
  }
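
   With made-up numbers: if the global domain currently has gdtc->avail = 1,000,000
   pages of which 300,000 are dirty, clean memory is 700,000 pages; a memcg domain
   that derived mdtc->avail = 900,000 pages from its own headroom is capped to
   700,000, since its writers cannot dirty more clean memory than the system as a
   whole has.
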
  
  /**
-  * bdi_dirty_limit - @bdi's share of dirty throttling threshold
-  * @bdi: the backing_dev_info to query
-  * @dirty: global dirty limit in pages
+  * __wb_calc_thresh - @wb's share of dirty throttling threshold
+  * @dtc: dirty_throttle_control of interest
   *
-  * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+  * Returns @wb's dirty limit in pages. The term "dirty" in the context of
   * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
   *
   * Note that balance_dirty_pages() will only seriously take it as a hard limit
   * control. For example, when the device is completely stalled due to some error
   * conditions, or when there are 1000 dd tasks writing to a slow 10MB/s USB key.
   * In the other normal situations, it acts more gently by throttling the tasks
-  * more (rather than completely block them) when the bdi dirty pages go high.
+  * more (rather than completely block them) when the wb dirty pages go high.
   *
   * It allocates high/low dirty limits to fast/slow devices, in order to prevent
   * - starving fast devices
   * - piling up dirty pages (that will take long time to sync) on slow devices
   *
-  * The bdi's share of dirty limit will be adapting to its throughput and
+  * The wb's share of dirty limit will be adapting to its throughput and
   * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
   */
- unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
  {
-       u64 bdi_dirty;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       u64 wb_thresh;
        long numerator, denominator;
+       unsigned long wb_min_ratio, wb_max_ratio;
  
        /*
-        * Calculate this BDI's share of the dirty ratio.
+        * Calculate this BDI's share of the thresh ratio.
         */
-       bdi_writeout_fraction(bdi, &numerator, &denominator);
+       fprop_fraction_percpu(&dom->completions, dtc->wb_completions,
+                             &numerator, &denominator);
+       wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;
+       wb_thresh *= numerator;
+       do_div(wb_thresh, denominator);
  
-       bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
-       bdi_dirty *= numerator;
-       do_div(bdi_dirty, denominator);
+       wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio);
  
-       bdi_dirty += (dirty * bdi->min_ratio) / 100;
-       if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
-               bdi_dirty = dirty * bdi->max_ratio / 100;
+       wb_thresh += (thresh * wb_min_ratio) / 100;
+       if (wb_thresh > (thresh * wb_max_ratio) / 100)
+               wb_thresh = thresh * wb_max_ratio / 100;
  
-       return bdi_dirty;
+       return wb_thresh;
+ }
+ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
+ {
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb),
+                                              .thresh = thresh };
+       return __wb_calc_thresh(&gdtc);
  }
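
   To make the proportion arithmetic concrete, a self-contained userspace toy (not
   kernel code; the completion fraction and ratios are invented). Note that for a
   cgroup wb the min/max ratios fed in here would first be scaled by the wb's share
   of the bdi's write bandwidth via wb_min_max_ratio() above:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long thresh = 200000;     /* domain dirty threshold, pages */
            long numerator = 3, denominator = 10;   /* wb's recent writeout share */
            unsigned long bdi_min_ratio = 0;        /* sum of reserved min ratios */
            unsigned long wb_min_ratio = 5, wb_max_ratio = 100;
            unsigned long long wb_thresh;

            wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100;     /* 200000 */
            wb_thresh = wb_thresh * numerator / denominator;        /* 60000  */
            wb_thresh += (thresh * wb_min_ratio) / 100;             /* +10000 */
            if (wb_thresh > (thresh * wb_max_ratio) / 100)
                    wb_thresh = thresh * wb_max_ratio / 100;        /* not hit */

            printf("wb_thresh = %llu pages\n", wb_thresh);          /* 70000 */
            return 0;
    }
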
  
  /*
@@@ -580,7 -768,7 +768,7 @@@ static long long pos_ratio_polynom(unsi
        long x;
  
        x = div64_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
 -                  limit - setpoint + 1);
 +                    (limit - setpoint) | 1);
        pos_ratio = x;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
        pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
   *
   * (o) global/bdi setpoints
   *
-  * We want the dirty pages be balanced around the global/bdi setpoints.
+  * We want the dirty pages be balanced around the global/wb setpoints.
   * When the number of dirty pages is higher/lower than the setpoint, the
   * dirty position control ratio (and hence task dirty ratelimit) will be
   * decreased/increased to bring the dirty pages back to the setpoint.
   *     if (dirty < setpoint) scale up   pos_ratio
   *     if (dirty > setpoint) scale down pos_ratio
   *
-  *     if (bdi_dirty < bdi_setpoint) scale up   pos_ratio
-  *     if (bdi_dirty > bdi_setpoint) scale down pos_ratio
+  *     if (wb_dirty < wb_setpoint) scale up   pos_ratio
+  *     if (wb_dirty > wb_setpoint) scale down pos_ratio
   *
   *     task_ratelimit = dirty_ratelimit * pos_ratio >> RATELIMIT_CALC_SHIFT
   *
   *   0 +------------.------------------.----------------------*------------->
   *           freerun^          setpoint^                 limit^   dirty pages
   *
-  * (o) bdi control line
+  * (o) wb control line
   *
   *     ^ pos_ratio
   *     |
   *     |                      .                           .
   *     |                      .                             .
   *   0 +----------------------.-------------------------------.------------->
-  *                bdi_setpoint^                    x_intercept^
+  *                wb_setpoint^                    x_intercept^
   *
-  * The bdi control line won't drop below pos_ratio=1/4, so that bdi_dirty can
+  * The wb control line won't drop below pos_ratio=1/4, so that wb_dirty can
   * be smoothly throttled down to normal if it starts high in situations like
   * - start writing to a slow SD card and a fast disk at the same time. The SD
-  *   card's bdi_dirty may rush to many times higher than bdi_setpoint.
-  * - the bdi dirty thresh drops quickly due to change of JBOD workload
+  *   card's wb_dirty may rush to many times higher than wb_setpoint.
+  * - the wb dirty thresh drops quickly due to change of JBOD workload
   */
- static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
-                                       unsigned long thresh,
-                                       unsigned long bg_thresh,
-                                       unsigned long dirty,
-                                       unsigned long bdi_thresh,
-                                       unsigned long bdi_dirty)
- {
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+ static void wb_position_ratio(struct dirty_throttle_control *dtc)
+ {
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
+       unsigned long wb_thresh = dtc->wb_thresh;
        unsigned long x_intercept;
        unsigned long setpoint;         /* dirty pages' target balance point */
-       unsigned long bdi_setpoint;
+       unsigned long wb_setpoint;
        unsigned long span;
        long long pos_ratio;            /* for scaling up/down the rate limit */
        long x;
  
-       if (unlikely(dirty >= limit))
-               return 0;
+       dtc->pos_ratio = 0;
+       if (unlikely(dtc->dirty >= limit))
+               return;
  
        /*
         * global setpoint
         * See comment for pos_ratio_polynom().
         */
        setpoint = (freerun + limit) / 2;
-       pos_ratio = pos_ratio_polynom(setpoint, dirty, limit);
+       pos_ratio = pos_ratio_polynom(setpoint, dtc->dirty, limit);
  
        /*
         * The strictlimit feature is a tool preventing mistrusted filesystems
         * from growing a large number of dirty pages before throttling. For
-        * such filesystems balance_dirty_pages always checks bdi counters
-        * against bdi limits. Even if global "nr_dirty" is under "freerun".
+        * such filesystems balance_dirty_pages always checks wb counters
+        * against wb limits. Even if global "nr_dirty" is under "freerun".
         * This is especially important for fuse which sets bdi->max_ratio to
         * 1% by default. Without the strictlimit feature, fuse writeback may
         * consume an arbitrary amount of RAM because it is accounted in
         * NR_WRITEBACK_TEMP which is not involved in calculating "nr_dirty".
         *
-        * Here, in bdi_position_ratio(), we calculate pos_ratio based on
-        * two values: bdi_dirty and bdi_thresh. Let's consider an example:
+        * Here, in wb_position_ratio(), we calculate pos_ratio based on
+        * two values: wb_dirty and wb_thresh. Let's consider an example:
         * total amount of RAM is 16GB, bdi->max_ratio is equal to 1%, global
         * limits are set by default to 10% and 20% (background and throttle).
-        * Then bdi_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
-        * bdi_dirty_limit(bdi, bg_thresh) is about ~4K pages. bdi_setpoint is
-        * about ~6K pages (as the average of background and throttle bdi
+        * Then wb_thresh is 1% of 20% of 16GB. This amounts to ~8K pages.
+        * wb_calc_thresh(wb, bg_thresh) is about ~4K pages. wb_setpoint is
+        * about ~6K pages (as the average of background and throttle wb
         * limits). The 3rd order polynomial will provide positive feedback if
-        * bdi_dirty is under bdi_setpoint and vice versa.
+        * wb_dirty is under wb_setpoint and vice versa.
         *
         * Note, that we cannot use global counters in these calculations
-        * because we want to throttle process writing to a strictlimit BDI
+        * because we want to throttle process writing to a strictlimit wb
         * much earlier than global "freerun" is reached (~23MB vs. ~2.3GB
         * in the example above).
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               long long bdi_pos_ratio;
-               unsigned long bdi_bg_thresh;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               long long wb_pos_ratio;
  
-               if (bdi_dirty < 8)
-                       return min_t(long long, pos_ratio * 2,
-                                    2 << RATELIMIT_CALC_SHIFT);
+               if (dtc->wb_dirty < 8) {
+                       dtc->pos_ratio = min_t(long long, pos_ratio * 2,
+                                          2 << RATELIMIT_CALC_SHIFT);
+                       return;
+               }
  
-               if (bdi_dirty >= bdi_thresh)
-                       return 0;
+               if (dtc->wb_dirty >= wb_thresh)
+                       return;
  
-               bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
-               bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
-                                                    bdi_bg_thresh);
+               wb_setpoint = dirty_freerun_ceiling(wb_thresh,
+                                                   dtc->wb_bg_thresh);
  
-               if (bdi_setpoint == 0 || bdi_setpoint == bdi_thresh)
-                       return 0;
+               if (wb_setpoint == 0 || wb_setpoint == wb_thresh)
+                       return;
  
-               bdi_pos_ratio = pos_ratio_polynom(bdi_setpoint, bdi_dirty,
-                                                 bdi_thresh);
+               wb_pos_ratio = pos_ratio_polynom(wb_setpoint, dtc->wb_dirty,
+                                                wb_thresh);
  
                /*
-                * Typically, for strictlimit case, bdi_setpoint << setpoint
-                * and pos_ratio >> bdi_pos_ratio. In the other words global
+                * Typically, for strictlimit case, wb_setpoint << setpoint
+                * and pos_ratio >> wb_pos_ratio. In other words, the global
                 * state ("dirty") is not the limiting factor and we have to
-                * make decision based on bdi counters. But there is an
+                * make the decision based on wb counters. But there is an
                 * important case when global pos_ratio should get precedence:
                 * global limits are exceeded (e.g. due to activities on other
-                * BDIs) while given strictlimit BDI is below limit.
+                * wb's) while given strictlimit wb is below limit.
                 *
-                * "pos_ratio * bdi_pos_ratio" would work for the case above,
+                * "pos_ratio * wb_pos_ratio" would work for the case above,
                 * but it would look too non-natural for the case of all
-                * activity in the system coming from a single strictlimit BDI
+                * activity in the system coming from a single strictlimit wb
                 * with bdi->max_ratio == 100%.
                 *
                 * Note that min() below somewhat changes the dynamics of the
                 * control system. Normally, pos_ratio value can be well over 3
-                * (when globally we are at freerun and bdi is well below bdi
+                * (when globally we are at freerun and wb is well below wb
                 * setpoint). Now the maximum pos_ratio in the same situation
                 * is 2. We might want to tweak this if we observe the control
                 * system is too slow to adapt.
                 */
-               return min(pos_ratio, bdi_pos_ratio);
+               dtc->pos_ratio = min(pos_ratio, wb_pos_ratio);
+               return;
        }
  
        /*
         * We have computed basic pos_ratio above based on global situation. If
-        * the bdi is over/under its share of dirty pages, we want to scale
+        * the wb is over/under its share of dirty pages, we want to scale
         * pos_ratio further down/up. That is done by the following mechanism.
         */
  
        /*
-        * bdi setpoint
+        * wb setpoint
         *
-        *        f(bdi_dirty) := 1.0 + k * (bdi_dirty - bdi_setpoint)
+        *        f(wb_dirty) := 1.0 + k * (wb_dirty - wb_setpoint)
         *
-        *                        x_intercept - bdi_dirty
+        *                        x_intercept - wb_dirty
         *                     := --------------------------
-        *                        x_intercept - bdi_setpoint
+        *                        x_intercept - wb_setpoint
         *
-        * The main bdi control line is a linear function that subjects to
+        * The main wb control line is a linear function that subjects to
         *
-        * (1) f(bdi_setpoint) = 1.0
-        * (2) k = - 1 / (8 * write_bw)  (in single bdi case)
-        *     or equally: x_intercept = bdi_setpoint + 8 * write_bw
+        * (1) f(wb_setpoint) = 1.0
+        * (2) k = - 1 / (8 * write_bw)  (in single wb case)
+        *     or equally: x_intercept = wb_setpoint + 8 * write_bw
         *
-        * For single bdi case, the dirty pages are observed to fluctuate
+        * For single wb case, the dirty pages are observed to fluctuate
         * regularly within range
-        *        [bdi_setpoint - write_bw/2, bdi_setpoint + write_bw/2]
+        *        [wb_setpoint - write_bw/2, wb_setpoint + write_bw/2]
         * for various filesystems, where (2) can yield in a reasonable 12.5%
         * fluctuation range for pos_ratio.
         *
-        * For JBOD case, bdi_thresh (not bdi_dirty!) could fluctuate up to its
+        * For JBOD case, wb_thresh (not wb_dirty!) could fluctuate up to its
         * own size, so move the slope over accordingly and choose a slope that
-        * yields 100% pos_ratio fluctuation on suddenly doubled bdi_thresh.
+        * yields 100% pos_ratio fluctuation on suddenly doubled wb_thresh.
         */
-       if (unlikely(bdi_thresh > thresh))
-               bdi_thresh = thresh;
+       if (unlikely(wb_thresh > dtc->thresh))
+               wb_thresh = dtc->thresh;
        /*
-        * It's very possible that bdi_thresh is close to 0 not because the
+        * It's very possible that wb_thresh is close to 0 not because the
         * device is slow, but that it has remained inactive for a long time.
         * Honour such devices with a reasonably good (hopefully IO-efficient)
         * threshold, so that the occasional writes won't be blocked and active
         * writes can ramp up the threshold quickly.
         */
-       bdi_thresh = max(bdi_thresh, (limit - dirty) / 8);
+       wb_thresh = max(wb_thresh, (limit - dtc->dirty) / 8);
        /*
-        * scale global setpoint to bdi's:
-        *      bdi_setpoint = setpoint * bdi_thresh / thresh
+        * scale global setpoint to wb's:
+        *      wb_setpoint = setpoint * wb_thresh / thresh
         */
-       x = div_u64((u64)bdi_thresh << 16, thresh | 1);
-       bdi_setpoint = setpoint * (u64)x >> 16;
 -      x = div_u64((u64)wb_thresh << 16, dtc->thresh + 1);
++      x = div_u64((u64)wb_thresh << 16, dtc->thresh | 1);
+       wb_setpoint = setpoint * (u64)x >> 16;
        /*
-        * Use span=(8*write_bw) in single bdi case as indicated by
-        * (thresh - bdi_thresh ~= 0) and transit to bdi_thresh in JBOD case.
+        * Use span=(8*write_bw) in single wb case as indicated by
+        * (thresh - wb_thresh ~= 0) and transit to wb_thresh in JBOD case.
         *
-        *        bdi_thresh                    thresh - bdi_thresh
-        * span = ---------- * (8 * write_bw) + ------------------- * bdi_thresh
-        *          thresh                            thresh
+        *        wb_thresh                    thresh - wb_thresh
+        * span = --------- * (8 * write_bw) + ------------------ * wb_thresh
+        *         thresh                           thresh
         */
-       span = (thresh - bdi_thresh + 8 * write_bw) * (u64)x >> 16;
-       x_intercept = bdi_setpoint + span;
+       span = (dtc->thresh - wb_thresh + 8 * write_bw) * (u64)x >> 16;
+       x_intercept = wb_setpoint + span;
  
-       if (bdi_dirty < x_intercept - span / 4) {
-               pos_ratio = div64_u64(pos_ratio * (x_intercept - bdi_dirty),
-                                     (x_intercept - bdi_setpoint) | 1);
+       if (dtc->wb_dirty < x_intercept - span / 4) {
+               pos_ratio = div64_u64(pos_ratio * (x_intercept - dtc->wb_dirty),
 -                                    x_intercept - wb_setpoint + 1);
++                                    (x_intercept - wb_setpoint) | 1);
        } else
                pos_ratio /= 4;
  
        /*
-        * bdi reserve area, safeguard against dirty pool underrun and disk idle
+        * wb reserve area, safeguard against dirty pool underrun and disk idle
         * It may push the desired control point of global dirty pages higher
         * than setpoint.
         */
-       x_intercept = bdi_thresh / 2;
-       if (bdi_dirty < x_intercept) {
-               if (bdi_dirty > x_intercept / 8)
-                       pos_ratio = div_u64(pos_ratio * x_intercept, bdi_dirty);
+       x_intercept = wb_thresh / 2;
+       if (dtc->wb_dirty < x_intercept) {
+               if (dtc->wb_dirty > x_intercept / 8)
+                       pos_ratio = div_u64(pos_ratio * x_intercept,
+                                           dtc->wb_dirty);
                else
                        pos_ratio *= 8;
        }
  
-       return pos_ratio;
+       dtc->pos_ratio = pos_ratio;
  }
  
- static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
-                                      unsigned long elapsed,
-                                      unsigned long written)
+ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
+                                     unsigned long elapsed,
+                                     unsigned long written)
  {
        const unsigned long period = roundup_pow_of_two(3 * HZ);
-       unsigned long avg = bdi->avg_write_bandwidth;
-       unsigned long old = bdi->write_bandwidth;
+       unsigned long avg = wb->avg_write_bandwidth;
+       unsigned long old = wb->write_bandwidth;
        u64 bw;
  
        /*
         * @written may have decreased due to account_page_redirty().
         * Avoid underflowing @bw calculation.
         */
-       bw = written - min(written, bdi->written_stamp);
+       bw = written - min(written, wb->written_stamp);
        bw *= HZ;
        if (unlikely(elapsed > period)) {
                do_div(bw, elapsed);
                avg = bw;
                goto out;
        }
-       bw += (u64)bdi->write_bandwidth * (period - elapsed);
+       bw += (u64)wb->write_bandwidth * (period - elapsed);
        bw >>= ilog2(period);
  
        /*
                avg += (old - avg) >> 3;
  
  out:
-       bdi->write_bandwidth = bw;
-       bdi->avg_write_bandwidth = avg;
+       /* keep avg > 0 to guarantee that tot > 0 if there are dirty wbs */
+       avg = max(avg, 1LU);
+       if (wb_has_dirty_io(wb)) {
+               long delta = avg - wb->avg_write_bandwidth;
+               WARN_ON_ONCE(atomic_long_add_return(delta,
+                                       &wb->bdi->tot_write_bandwidth) <= 0);
+       }
+       wb->write_bandwidth = bw;
+       wb->avg_write_bandwidth = avg;
  }
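
wb_update_write_bandwidth() above folds the rate seen over the last interval into a roughly 3 second window and lets a smoothed average chase the result in 1/8 steps; the exact spike filter sits in elided lines, so the userspace model below approximates it:

#include <stdint.h>
#include <stdio.h>

#define HZ	100
#define PERIOD	512	/* roundup_pow_of_two(3 * HZ) */

struct bw_est {
	uint64_t write_bandwidth;	/* recent estimate, pages/s */
	uint64_t avg_write_bandwidth;	/* smoothed estimate, pages/s */
	uint64_t written_stamp;		/* pages written at the last update */
};

static void bw_update(struct bw_est *e, uint64_t written, uint64_t elapsed)
{
	uint64_t bw = (written - e->written_stamp) * HZ;
	uint64_t old = e->write_bandwidth;
	uint64_t avg = e->avg_write_bandwidth;

	if (elapsed > PERIOD) {
		bw /= elapsed;			/* long gap: trust the sample fully */
		avg = bw;
	} else {
		/* blend with the old estimate, weighted by the rest of the period */
		bw += e->write_bandwidth * (PERIOD - elapsed);
		bw >>= 9;			/* ilog2(PERIOD) */
		/* let avg follow only when it lags behind the trend (assumed filter) */
		if (avg > old && old >= bw)
			avg -= (avg - old) >> 3;
		else if (avg < old && old <= bw)
			avg += (old - avg) >> 3;
	}
	e->write_bandwidth = bw;
	e->avg_write_bandwidth = avg ? avg : 1;	/* keep avg > 0, as above */
	e->written_stamp = written;
}

int main(void)
{
	struct bw_est e = { 1000, 1000, 0 };

	bw_update(&e, 4000, 2 * HZ);	/* 4000 pages written over 2 seconds */
	printf("bw=%llu avg=%llu\n",
	       (unsigned long long)e.write_bandwidth,
	       (unsigned long long)e.avg_write_bandwidth);
	return 0;
}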
  
- /*
-  * The global dirtyable memory and dirty threshold could be suddenly knocked
-  * down by a large amount (eg. on the startup of KVM in a swapless system).
-  * This may throw the system into deep dirty exceeded state and throttle
-  * heavy/light dirtiers alike. To retain good responsiveness, maintain
-  * global_dirty_limit for tracking slowly down to the knocked down dirty
-  * threshold.
-  */
- static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+ static void update_dirty_limit(struct dirty_throttle_control *dtc)
  {
-       unsigned long limit = global_dirty_limit;
+       struct wb_domain *dom = dtc_dom(dtc);
+       unsigned long thresh = dtc->thresh;
+       unsigned long limit = dom->dirty_limit;
  
        /*
         * Follow up in one step.
        /*
         * Follow down slowly. Use the higher one as the target, because thresh
         * may drop below dirty. This is exactly the reason to introduce
-        * global_dirty_limit which is guaranteed to lie above the dirty pages.
+        * dom->dirty_limit which is guaranteed to lie above the dirty pages.
         */
-       thresh = max(thresh, dirty);
+       thresh = max(thresh, dtc->dirty);
        if (limit > thresh) {
                limit -= (limit - thresh) >> 5;
                goto update;
        }
        return;
  update:
-       global_dirty_limit = limit;
+       dom->dirty_limit = limit;
  }
  
- static void global_update_bandwidth(unsigned long thresh,
-                                   unsigned long dirty,
+ static void domain_update_bandwidth(struct dirty_throttle_control *dtc,
                                    unsigned long now)
  {
-       static DEFINE_SPINLOCK(dirty_lock);
-       static unsigned long update_time = INITIAL_JIFFIES;
+       struct wb_domain *dom = dtc_dom(dtc);
  
        /*
         * check locklessly first to optimize away locking for the most time
         */
-       if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+       if (time_before(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL))
                return;
  
-       spin_lock(&dirty_lock);
-       if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
-               update_dirty_limit(thresh, dirty);
-               update_time = now;
+       spin_lock(&dom->lock);
+       if (time_after_eq(now, dom->dirty_limit_tstamp + BANDWIDTH_INTERVAL)) {
+               update_dirty_limit(dtc);
+               dom->dirty_limit_tstamp = now;
        }
-       spin_unlock(&dirty_lock);
+       spin_unlock(&dom->lock);
  }
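
update_dirty_limit() is an asymmetric tracker: the domain limit snaps up to a rising threshold at once, but when the threshold falls it decays in 1/32 steps and never aims below the current dirty count, and domain_update_bandwidth() gates the whole thing with a cheap lockless time check before taking dom->lock. A tiny pure-function model of the tracking rule (the snap-up branch is among the elided lines, so it is inferred from the "Follow up in one step" comment):

/* one 200ms step of the dirty_limit tracker, as a pure function */
static unsigned long track_dirty_limit(unsigned long limit,
				       unsigned long thresh,
				       unsigned long dirty)
{
	if (limit < thresh)		/* follow a rising thresh in one step */
		return thresh;
	if (thresh < dirty)		/* never aim below the dirty pages */
		thresh = dirty;
	if (limit > thresh)		/* follow a falling thresh slowly */
		limit -= (limit - thresh) >> 5;
	return limit;
}

Starting from limit=100000 with thresh=50000 and dirty=40000, each 200ms call walks the limit down by roughly 1/32 of the remaining gap.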
  
  /*
-  * Maintain bdi->dirty_ratelimit, the base dirty throttle rate.
+  * Maintain wb->dirty_ratelimit, the base dirty throttle rate.
   *
-  * Normal bdi tasks will be curbed at or below it in long term.
+  * Normal wb tasks will be curbed at or below it in long term.
   * Obviously it should be around (write_bw / N) when there are N dd tasks.
   */
- static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
-                                      unsigned long thresh,
-                                      unsigned long bg_thresh,
-                                      unsigned long dirty,
-                                      unsigned long bdi_thresh,
-                                      unsigned long bdi_dirty,
-                                      unsigned long dirtied,
-                                      unsigned long elapsed)
- {
-       unsigned long freerun = dirty_freerun_ceiling(thresh, bg_thresh);
-       unsigned long limit = hard_dirty_limit(thresh);
+ static void wb_update_dirty_ratelimit(struct dirty_throttle_control *dtc,
+                                     unsigned long dirtied,
+                                     unsigned long elapsed)
+ {
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long dirty = dtc->dirty;
+       unsigned long freerun = dirty_freerun_ceiling(dtc->thresh, dtc->bg_thresh);
+       unsigned long limit = hard_dirty_limit(dtc_dom(dtc), dtc->thresh);
        unsigned long setpoint = (freerun + limit) / 2;
-       unsigned long write_bw = bdi->avg_write_bandwidth;
-       unsigned long dirty_ratelimit = bdi->dirty_ratelimit;
+       unsigned long write_bw = wb->avg_write_bandwidth;
+       unsigned long dirty_ratelimit = wb->dirty_ratelimit;
        unsigned long dirty_rate;
        unsigned long task_ratelimit;
        unsigned long balanced_dirty_ratelimit;
-       unsigned long pos_ratio;
        unsigned long step;
        unsigned long x;
  
         * The dirty rate will match the writeout rate in long term, except
         * when dirty pages are truncated by userspace or re-dirtied by FS.
         */
-       dirty_rate = (dirtied - bdi->dirtied_stamp) * HZ / elapsed;
+       dirty_rate = (dirtied - wb->dirtied_stamp) * HZ / elapsed;
  
-       pos_ratio = bdi_position_ratio(bdi, thresh, bg_thresh, dirty,
-                                      bdi_thresh, bdi_dirty);
        /*
         * task_ratelimit reflects each dd's dirty rate for the past 200ms.
         */
        task_ratelimit = (u64)dirty_ratelimit *
-                                       pos_ratio >> RATELIMIT_CALC_SHIFT;
+                                       dtc->pos_ratio >> RATELIMIT_CALC_SHIFT;
        task_ratelimit++; /* it helps ramp up dirty_ratelimit from tiny values */
  
        /*
         * A linear estimation of the "balanced" throttle rate. The theory is,
-        * if there are N dd tasks, each throttled at task_ratelimit, the bdi's
+        * if there are N dd tasks, each throttled at task_ratelimit, the wb's
         * dirty_rate will be measured to be (N * task_ratelimit). So the below
         * formula will yield the balanced rate limit (write_bw / N).
         *
        /*
         * We could safely do this and return immediately:
         *
-        *      bdi->dirty_ratelimit = balanced_dirty_ratelimit;
+        *      wb->dirty_ratelimit = balanced_dirty_ratelimit;
         *
         * However to get a more stable dirty_ratelimit, the below elaborated
         * code makes use of task_ratelimit to filter out singular points and
        step = 0;
  
        /*
-        * For strictlimit case, calculations above were based on bdi counters
-        * and limits (starting from pos_ratio = bdi_position_ratio() and up to
+        * For strictlimit case, calculations above were based on wb counters
+        * and limits (starting from pos_ratio = wb_position_ratio() and up to
         * balanced_dirty_ratelimit = task_ratelimit * write_bw / dirty_rate).
-        * Hence, to calculate "step" properly, we have to use bdi_dirty as
-        * "dirty" and bdi_setpoint as "setpoint".
+        * Hence, to calculate "step" properly, we have to use wb_dirty as
+        * "dirty" and wb_setpoint as "setpoint".
         *
-        * We rampup dirty_ratelimit forcibly if bdi_dirty is low because
-        * it's possible that bdi_thresh is close to zero due to inactivity
-        * of backing device (see the implementation of bdi_dirty_limit()).
+        * We ramp up dirty_ratelimit forcibly if wb_dirty is low because
+        * it's possible that wb_thresh is close to zero due to inactivity
+        * of the backing device.
         */
-       if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
-               dirty = bdi_dirty;
-               if (bdi_dirty < 8)
-                       setpoint = bdi_dirty + 1;
+       if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+               dirty = dtc->wb_dirty;
+               if (dtc->wb_dirty < 8)
+                       setpoint = dtc->wb_dirty + 1;
                else
-                       setpoint = (bdi_thresh +
-                                   bdi_dirty_limit(bdi, bg_thresh)) / 2;
+                       setpoint = (dtc->wb_thresh + dtc->wb_bg_thresh) / 2;
        }
  
        if (dirty < setpoint) {
-               x = min3(bdi->balanced_dirty_ratelimit,
+               x = min3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit < x)
                        step = x - dirty_ratelimit;
        } else {
-               x = max3(bdi->balanced_dirty_ratelimit,
+               x = max3(wb->balanced_dirty_ratelimit,
                         balanced_dirty_ratelimit, task_ratelimit);
                if (dirty_ratelimit > x)
                        step = dirty_ratelimit - x;
        else
                dirty_ratelimit -= step;
  
-       bdi->dirty_ratelimit = max(dirty_ratelimit, 1UL);
-       bdi->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
+       wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
+       wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
  
-       trace_bdi_dirty_ratelimit(bdi, dirty_rate, task_ratelimit);
+       trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
  }
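
The heart of the update above is the "balanced" estimate: with N dirtier tasks each throttled at task_ratelimit, the measured dirty_rate comes out near N * task_ratelimit, so scaling by write_bw / dirty_rate lands each task close to write_bw / N. A worked example with made-up numbers (RATELIMIT_CALC_SHIFT assumed to be 10):

#include <stdio.h>

#define SHIFT	10	/* RATELIMIT_CALC_SHIFT, assumed */

int main(void)
{
	unsigned long write_bw = 40000;		/* pages/s the device sustains */
	unsigned long dirty_ratelimit = 30000;	/* current base rate, too high */
	unsigned long pos_ratio = 1 << SHIFT;	/* sitting right at the setpoint */
	unsigned long dirty_rate = 60000;	/* two dirtiers, each at ~30000 pages/s */

	unsigned long task_ratelimit =
		((unsigned long long)dirty_ratelimit * pos_ratio >> SHIFT) + 1;
	unsigned long balanced =
		(unsigned long long)task_ratelimit * write_bw / dirty_rate;

	/* two dd's against a 40000 pages/s device converge on ~write_bw / 2 */
	printf("task_ratelimit=%lu balanced=%lu\n", task_ratelimit, balanced);
	return 0;
}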
  
- void __bdi_update_bandwidth(struct backing_dev_info *bdi,
-                           unsigned long thresh,
-                           unsigned long bg_thresh,
-                           unsigned long dirty,
-                           unsigned long bdi_thresh,
-                           unsigned long bdi_dirty,
-                           unsigned long start_time)
+ static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
+                                 struct dirty_throttle_control *mdtc,
+                                 unsigned long start_time,
+                                 bool update_ratelimit)
  {
+       struct bdi_writeback *wb = gdtc->wb;
        unsigned long now = jiffies;
-       unsigned long elapsed = now - bdi->bw_time_stamp;
+       unsigned long elapsed = now - wb->bw_time_stamp;
        unsigned long dirtied;
        unsigned long written;
  
+       lockdep_assert_held(&wb->list_lock);
        /*
         * rate-limit, only update once every 200ms.
         */
        if (elapsed < BANDWIDTH_INTERVAL)
                return;
  
-       dirtied = percpu_counter_read(&bdi->bdi_stat[BDI_DIRTIED]);
-       written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+       dirtied = percpu_counter_read(&wb->stat[WB_DIRTIED]);
+       written = percpu_counter_read(&wb->stat[WB_WRITTEN]);
  
        /*
         * Skip quiet periods when disk bandwidth is under-utilized.
         * (at least 1s idle time between two flusher runs)
         */
-       if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+       if (elapsed > HZ && time_before(wb->bw_time_stamp, start_time))
                goto snapshot;
  
-       if (thresh) {
-               global_update_bandwidth(thresh, dirty, now);
-               bdi_update_dirty_ratelimit(bdi, thresh, bg_thresh, dirty,
-                                          bdi_thresh, bdi_dirty,
-                                          dirtied, elapsed);
+       if (update_ratelimit) {
+               domain_update_bandwidth(gdtc, now);
+               wb_update_dirty_ratelimit(gdtc, dirtied, elapsed);
+               /*
+                * @mdtc is always NULL if !CGROUP_WRITEBACK but the
+                * compiler has no way to figure that out.  Help it.
+                */
+               if (IS_ENABLED(CONFIG_CGROUP_WRITEBACK) && mdtc) {
+                       domain_update_bandwidth(mdtc, now);
+                       wb_update_dirty_ratelimit(mdtc, dirtied, elapsed);
+               }
        }
-       bdi_update_write_bandwidth(bdi, elapsed, written);
+       wb_update_write_bandwidth(wb, elapsed, written);
  
  snapshot:
-       bdi->dirtied_stamp = dirtied;
-       bdi->written_stamp = written;
-       bdi->bw_time_stamp = now;
+       wb->dirtied_stamp = dirtied;
+       wb->written_stamp = written;
+       wb->bw_time_stamp = now;
  }
  
- static void bdi_update_bandwidth(struct backing_dev_info *bdi,
-                                unsigned long thresh,
-                                unsigned long bg_thresh,
-                                unsigned long dirty,
-                                unsigned long bdi_thresh,
-                                unsigned long bdi_dirty,
-                                unsigned long start_time)
+ void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time)
  {
-       if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
-               return;
-       spin_lock(&bdi->wb.list_lock);
-       __bdi_update_bandwidth(bdi, thresh, bg_thresh, dirty,
-                              bdi_thresh, bdi_dirty, start_time);
-       spin_unlock(&bdi->wb.list_lock);
+       struct dirty_throttle_control gdtc = { GDTC_INIT(wb) };
+       __wb_update_bandwidth(&gdtc, NULL, start_time, false);
  }
  
  /*
@@@ -1187,10 -1366,10 +1366,10 @@@ static unsigned long dirty_poll_interva
        return 1;
  }
  
- static unsigned long bdi_max_pause(struct backing_dev_info *bdi,
-                                  unsigned long bdi_dirty)
+ static unsigned long wb_max_pause(struct bdi_writeback *wb,
+                                 unsigned long wb_dirty)
  {
-       unsigned long bw = bdi->avg_write_bandwidth;
+       unsigned long bw = wb->avg_write_bandwidth;
        unsigned long t;
  
        /*
         *
         * 8 serves as the safety ratio.
         */
-       t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
+       t = wb_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8));
        t++;
  
        return min_t(unsigned long, t, MAX_PAUSE);
  }
  
- static long bdi_min_pause(struct backing_dev_info *bdi,
-                         long max_pause,
-                         unsigned long task_ratelimit,
-                         unsigned long dirty_ratelimit,
-                         int *nr_dirtied_pause)
+ static long wb_min_pause(struct bdi_writeback *wb,
+                        long max_pause,
+                        unsigned long task_ratelimit,
+                        unsigned long dirty_ratelimit,
+                        int *nr_dirtied_pause)
  {
-       long hi = ilog2(bdi->avg_write_bandwidth);
-       long lo = ilog2(bdi->dirty_ratelimit);
+       long hi = ilog2(wb->avg_write_bandwidth);
+       long lo = ilog2(wb->dirty_ratelimit);
        long t;         /* target pause */
        long pause;     /* estimated next pause */
        int pages;      /* target nr_dirtied_pause */
        return pages >= DIRTY_POLL_THRESH ? 1 + t / 2 : t;
  }
  
- static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
-                                   unsigned long dirty_thresh,
-                                   unsigned long background_thresh,
-                                   unsigned long *bdi_dirty,
-                                   unsigned long *bdi_thresh,
-                                   unsigned long *bdi_bg_thresh)
+ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc)
  {
-       unsigned long bdi_reclaimable;
+       struct bdi_writeback *wb = dtc->wb;
+       unsigned long wb_reclaimable;
  
        /*
-        * bdi_thresh is not treated as some limiting factor as
+        * wb_thresh is not treated as a limiting factor the way
         * dirty_thresh is, for the following reasons:
-        * - in JBOD setup, bdi_thresh can fluctuate a lot
+        * - in JBOD setup, wb_thresh can fluctuate a lot
         * - in a system with HDD and USB key, the USB key may somehow
-        *   go into state (bdi_dirty >> bdi_thresh) either because
-        *   bdi_dirty starts high, or because bdi_thresh drops low.
+        *   go into state (wb_dirty >> wb_thresh) either because
+        *   wb_dirty starts high, or because wb_thresh drops low.
         *   In this case we don't want to hard throttle the USB key
-        *   dirtiers for 100 seconds until bdi_dirty drops under
-        *   bdi_thresh. Instead the auxiliary bdi control line in
-        *   bdi_position_ratio() will let the dirtier task progress
-        *   at some rate <= (write_bw / 2) for bringing down bdi_dirty.
+        *   dirtiers for 100 seconds until wb_dirty drops under
+        *   wb_thresh. Instead the auxiliary wb control line in
+        *   wb_position_ratio() will let the dirtier task progress
+        *   at some rate <= (write_bw / 2) for bringing down wb_dirty.
         */
-       *bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-       if (bdi_bg_thresh)
-               *bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
-                                                       background_thresh,
-                                                       dirty_thresh) : 0;
+       dtc->wb_thresh = __wb_calc_thresh(dtc);
+       dtc->wb_bg_thresh = dtc->thresh ?
+               div_u64((u64)dtc->wb_thresh * dtc->bg_thresh, dtc->thresh) : 0;
  
        /*
         * In order to avoid the stacked BDI deadlock we need
         * actually dirty; with m+n sitting in the percpu
         * deltas.
         */
-       if (*bdi_thresh < 2 * bdi_stat_error(bdi)) {
-               bdi_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat_sum(bdi, BDI_WRITEBACK);
+       if (dtc->wb_thresh < 2 * wb_stat_error(wb)) {
+               wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK);
        } else {
-               bdi_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-               *bdi_dirty = bdi_reclaimable +
-                       bdi_stat(bdi, BDI_WRITEBACK);
+               wb_reclaimable = wb_stat(wb, WB_RECLAIMABLE);
+               dtc->wb_dirty = wb_reclaimable + wb_stat(wb, WB_WRITEBACK);
        }
  }
  
   * perform some writeout.
   */
  static void balance_dirty_pages(struct address_space *mapping,
+                               struct bdi_writeback *wb,
                                unsigned long pages_dirtied)
  {
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+       struct dirty_throttle_control *sdtc;
        unsigned long nr_reclaimable;   /* = file_dirty + unstable_nfs */
-       unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
-       unsigned long background_thresh;
-       unsigned long dirty_thresh;
        long period;
        long pause;
        long max_pause;
        bool dirty_exceeded = false;
        unsigned long task_ratelimit;
        unsigned long dirty_ratelimit;
-       unsigned long pos_ratio;
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct backing_dev_info *bdi = wb->bdi;
        bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
        unsigned long start_time = jiffies;
  
        for (;;) {
                unsigned long now = jiffies;
-               unsigned long uninitialized_var(bdi_thresh);
-               unsigned long thresh;
-               unsigned long uninitialized_var(bdi_dirty);
-               unsigned long dirty;
-               unsigned long bg_thresh;
+               unsigned long dirty, thresh, bg_thresh;
+               unsigned long m_dirty, m_thresh, m_bg_thresh;
  
                /*
                 * Unstable writes are a feature of certain networked
                 */
                nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
                                        global_page_state(NR_UNSTABLE_NFS);
-               nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
+               gdtc->avail = global_dirtyable_memory();
+               gdtc->dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
  
-               global_dirty_limits(&background_thresh, &dirty_thresh);
+               domain_dirty_limits(gdtc);
  
                if (unlikely(strictlimit)) {
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, &bg_thresh);
+                       wb_dirty_limits(gdtc);
  
-                       dirty = bdi_dirty;
-                       thresh = bdi_thresh;
+                       dirty = gdtc->wb_dirty;
+                       thresh = gdtc->wb_thresh;
+                       bg_thresh = gdtc->wb_bg_thresh;
                } else {
-                       dirty = nr_dirty;
-                       thresh = dirty_thresh;
-                       bg_thresh = background_thresh;
+                       dirty = gdtc->dirty;
+                       thresh = gdtc->thresh;
+                       bg_thresh = gdtc->bg_thresh;
+               }
+               if (mdtc) {
+                       unsigned long writeback;
+                       /*
+                        * If @wb belongs to !root memcg, repeat the same
+                        * basic calculations for the memcg domain.
+                        */
+                       mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty,
+                                           &writeback);
+                       mdtc_cap_avail(mdtc);
+                       mdtc->dirty += writeback;
+                       domain_dirty_limits(mdtc);
+                       if (unlikely(strictlimit)) {
+                               wb_dirty_limits(mdtc);
+                               m_dirty = mdtc->wb_dirty;
+                               m_thresh = mdtc->wb_thresh;
+                               m_bg_thresh = mdtc->wb_bg_thresh;
+                       } else {
+                               m_dirty = mdtc->dirty;
+                               m_thresh = mdtc->thresh;
+                               m_bg_thresh = mdtc->bg_thresh;
+                       }
                }
  
                /*
                 * Throttle it only when the background writeback cannot
                 * catch up. This avoids (excessively) small writeouts
-                * when the bdi limits are ramping up in case of !strictlimit.
+                * when the wb limits are ramping up in case of !strictlimit.
                 *
-                * In strictlimit case make decision based on the bdi counters
-                * and limits. Small writeouts when the bdi limits are ramping
+                * In strictlimit case make decision based on the wb counters
+                * and limits. Small writeouts when the wb limits are ramping
                 * up are the price we consciously pay for strictlimit-ing.
+                *
+                * If memcg domain is in effect, @dirty should be under
+                * both global and memcg freerun ceilings.
                 */
-               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh)) {
+               if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
+                   (!mdtc ||
+                    m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
+                       unsigned long intv = dirty_poll_interval(dirty, thresh);
+                       unsigned long m_intv = ULONG_MAX;
                        current->dirty_paused_when = now;
                        current->nr_dirtied = 0;
-                       current->nr_dirtied_pause =
-                               dirty_poll_interval(dirty, thresh);
+                       if (mdtc)
+                               m_intv = dirty_poll_interval(m_dirty, m_thresh);
+                       current->nr_dirtied_pause = min(intv, m_intv);
                        break;
                }
  
-               if (unlikely(!writeback_in_progress(bdi)))
-                       bdi_start_background_writeback(bdi);
+               if (unlikely(!writeback_in_progress(wb)))
+                       wb_start_background_writeback(wb);
  
+               /*
+                * Calculate global domain's pos_ratio and select the
+                * global dtc by default.
+                */
                if (!strictlimit)
-                       bdi_dirty_limits(bdi, dirty_thresh, background_thresh,
-                                        &bdi_dirty, &bdi_thresh, NULL);
-               dirty_exceeded = (bdi_dirty > bdi_thresh) &&
-                                ((nr_dirty > dirty_thresh) || strictlimit);
-               if (dirty_exceeded && !bdi->dirty_exceeded)
-                       bdi->dirty_exceeded = 1;
-               bdi_update_bandwidth(bdi, dirty_thresh, background_thresh,
-                                    nr_dirty, bdi_thresh, bdi_dirty,
-                                    start_time);
-               dirty_ratelimit = bdi->dirty_ratelimit;
-               pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
-                                              background_thresh, nr_dirty,
-                                              bdi_thresh, bdi_dirty);
-               task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
+                       wb_dirty_limits(gdtc);
+               dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
+                       ((gdtc->dirty > gdtc->thresh) || strictlimit);
+               wb_position_ratio(gdtc);
+               sdtc = gdtc;
+               if (mdtc) {
+                       /*
+                        * If memcg domain is in effect, calculate its
+                        * pos_ratio.  @wb should satisfy constraints from
+                        * both global and memcg domains.  Choose the one
+                        * w/ lower pos_ratio.
+                        */
+                       if (!strictlimit)
+                               wb_dirty_limits(mdtc);
+                       dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
+                               ((mdtc->dirty > mdtc->thresh) || strictlimit);
+                       wb_position_ratio(mdtc);
+                       if (mdtc->pos_ratio < gdtc->pos_ratio)
+                               sdtc = mdtc;
+               }
+               if (dirty_exceeded && !wb->dirty_exceeded)
+                       wb->dirty_exceeded = 1;
+               if (time_is_before_jiffies(wb->bw_time_stamp +
+                                          BANDWIDTH_INTERVAL)) {
+                       spin_lock(&wb->list_lock);
+                       __wb_update_bandwidth(gdtc, mdtc, start_time, true);
+                       spin_unlock(&wb->list_lock);
+               }
+               /* throttle according to the chosen dtc */
+               dirty_ratelimit = wb->dirty_ratelimit;
+               task_ratelimit = ((u64)dirty_ratelimit * sdtc->pos_ratio) >>
                                                        RATELIMIT_CALC_SHIFT;
-               max_pause = bdi_max_pause(bdi, bdi_dirty);
-               min_pause = bdi_min_pause(bdi, max_pause,
-                                         task_ratelimit, dirty_ratelimit,
-                                         &nr_dirtied_pause);
+               max_pause = wb_max_pause(wb, sdtc->wb_dirty);
+               min_pause = wb_min_pause(wb, max_pause,
+                                        task_ratelimit, dirty_ratelimit,
+                                        &nr_dirtied_pause);
  
                if (unlikely(task_ratelimit == 0)) {
                        period = max_pause;
                 */
                if (pause < min_pause) {
                        trace_balance_dirty_pages(bdi,
-                                                 dirty_thresh,
-                                                 background_thresh,
-                                                 nr_dirty,
-                                                 bdi_thresh,
-                                                 bdi_dirty,
+                                                 sdtc->thresh,
+                                                 sdtc->bg_thresh,
+                                                 sdtc->dirty,
+                                                 sdtc->wb_thresh,
+                                                 sdtc->wb_dirty,
                                                  dirty_ratelimit,
                                                  task_ratelimit,
                                                  pages_dirtied,
  
  pause:
                trace_balance_dirty_pages(bdi,
-                                         dirty_thresh,
-                                         background_thresh,
-                                         nr_dirty,
-                                         bdi_thresh,
-                                         bdi_dirty,
+                                         sdtc->thresh,
+                                         sdtc->bg_thresh,
+                                         sdtc->dirty,
+                                         sdtc->wb_thresh,
+                                         sdtc->wb_dirty,
                                          dirty_ratelimit,
                                          task_ratelimit,
                                          pages_dirtied,
                current->nr_dirtied_pause = nr_dirtied_pause;
  
                /*
-                * This is typically equal to (nr_dirty < dirty_thresh) and can
-                * also keep "1000+ dd on a slow USB stick" under control.
+                * This is typically equal to (dirty < thresh) and can also
+                * keep "1000+ dd on a slow USB stick" under control.
                 */
                if (task_ratelimit)
                        break;
  
                /*
                 * In the case of an unresponsive NFS server and the NFS dirty
-                * pages exceeds dirty_thresh, give the other good bdi's a pipe
+                * pages exceed dirty_thresh, give the other good wb's a pipe
                 * to go through, so that tasks on them still remain responsive.
                 *
                 * In theory 1 page is enough to keep the consumer-producer
                 * pipe going: the flusher cleans 1 page => the task dirties 1
-                * more page. However bdi_dirty has accounting errors.  So use
-                * the larger and more IO friendly bdi_stat_error.
+                * more page. However wb_dirty has accounting errors.  So use
+                * the larger and more IO friendly wb_stat_error.
                 */
-               if (bdi_dirty <= bdi_stat_error(bdi))
+               if (sdtc->wb_dirty <= wb_stat_error(wb))
                        break;
  
                if (fatal_signal_pending(current))
                        break;
        }
  
-       if (!dirty_exceeded && bdi->dirty_exceeded)
-               bdi->dirty_exceeded = 0;
+       if (!dirty_exceeded && wb->dirty_exceeded)
+               wb->dirty_exceeded = 0;
  
-       if (writeback_in_progress(bdi))
+       if (writeback_in_progress(wb))
                return;
  
        /*
        if (laptop_mode)
                return;
  
-       if (nr_reclaimable > background_thresh)
-               bdi_start_background_writeback(bdi);
+       if (nr_reclaimable > gdtc->bg_thresh)
+               wb_start_background_writeback(wb);
  }
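
The loop above turns the chosen rate into a sleep. The pause arithmetic itself falls in the elided lines, so the model below is only an assumption consistent with what is visible: a period derived from pages_dirtied and task_ratelimit, credit for time already spent since the last pause, and a cap at the per-wb max_pause:

#include <stdio.h>

#define HZ		100
#define MAX_PAUSE	(HZ / 5)	/* ~200ms cap, assumed */

static long compute_pause(unsigned long pages_dirtied,
			  unsigned long task_ratelimit,	/* pages per second */
			  long since_last_pause)	/* jiffies since the last sleep */
{
	long period, pause;

	if (task_ratelimit == 0)
		return MAX_PAUSE;		/* fully throttled: sleep the maximum */
	period = HZ * pages_dirtied / task_ratelimit;	/* jiffies "owed" for the dirtying */
	pause = period - since_last_pause;	/* credit time already spent running */
	if (pause > MAX_PAUSE)
		pause = MAX_PAUSE;
	return pause;				/* <= 0 means no sleep is needed yet */
}

int main(void)
{
	/* 32 pages dirtied at 1600 pages/s with 1 jiffy already elapsed: sleep 1 jiffy */
	printf("%ld\n", compute_pause(32, 1600, 1));
	return 0;
}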
  
  static DEFINE_PER_CPU(int, bdp_ratelimits);
@@@ -1577,15 -1809,22 +1809,22 @@@ DEFINE_PER_CPU(int, dirty_throttle_leak
   */
  void balance_dirty_pages_ratelimited(struct address_space *mapping)
  {
-       struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+       struct inode *inode = mapping->host;
+       struct backing_dev_info *bdi = inode_to_bdi(inode);
+       struct bdi_writeback *wb = NULL;
        int ratelimit;
        int *p;
  
        if (!bdi_cap_account_dirty(bdi))
                return;
  
+       if (inode_cgwb_enabled(inode))
+               wb = wb_get_create_current(bdi, GFP_KERNEL);
+       if (!wb)
+               wb = &bdi->wb;
        ratelimit = current->nr_dirtied_pause;
-       if (bdi->dirty_exceeded)
+       if (wb->dirty_exceeded)
                ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
  
        preempt_disable();
        preempt_enable();
  
        if (unlikely(current->nr_dirtied >= ratelimit))
-               balance_dirty_pages(mapping, current->nr_dirtied);
+               balance_dirty_pages(mapping, wb, current->nr_dirtied);
+       wb_put(wb);
  }
  EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
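
balance_dirty_pages_ratelimited() keeps the common path cheap: the full balancing loop only runs once the task has dirtied nr_dirtied_pause pages, with a much lower trigger while the wb has exceeded its limits; the per-CPU leak counters handled in the elided lines are omitted. A sketch of that gate:

/* per-task state mirrored from current->nr_dirtied{,_pause}, illustrative only */
struct task_model {
	int nr_dirtied;
	int nr_dirtied_pause;
};

/* returns 1 when the caller should run the full balance_dirty_pages() path */
static int should_balance(struct task_model *t, int wb_dirty_exceeded)
{
	int ratelimit = t->nr_dirtied_pause;

	if (wb_dirty_exceeded && ratelimit > 8)
		ratelimit = 8;		/* 32 >> (PAGE_SHIFT - 10) with 4K pages */

	t->nr_dirtied++;		/* one more page dirtied by this task */
	return t->nr_dirtied >= ratelimit;
}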
  
+ /**
+  * wb_over_bg_thresh - does @wb need to be written back?
+  * @wb: bdi_writeback of interest
+  *
+  * Determines whether background writeback should keep writing @wb or it's
+  * clean enough.  Returns %true if writeback should continue.
+  */
+ bool wb_over_bg_thresh(struct bdi_writeback *wb)
+ {
+       struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) };
+       struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) };
+       struct dirty_throttle_control * const gdtc = &gdtc_stor;
+       struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
+                                                    &mdtc_stor : NULL;
+       /*
+        * Similar to balance_dirty_pages() but ignores pages being written
+        * as we're trying to decide whether to put more under writeback.
+        */
+       gdtc->avail = global_dirtyable_memory();
+       gdtc->dirty = global_page_state(NR_FILE_DIRTY) +
+                     global_page_state(NR_UNSTABLE_NFS);
+       domain_dirty_limits(gdtc);
+       if (gdtc->dirty > gdtc->bg_thresh)
+               return true;
+       if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(gdtc))
+               return true;
+       if (mdtc) {
+               unsigned long writeback;
+               mem_cgroup_wb_stats(wb, &mdtc->avail, &mdtc->dirty, &writeback);
+               mdtc_cap_avail(mdtc);
+               domain_dirty_limits(mdtc);      /* ditto, ignore writeback */
+               if (mdtc->dirty > mdtc->bg_thresh)
+                       return true;
+               if (wb_stat(wb, WB_RECLAIMABLE) > __wb_calc_thresh(mdtc))
+                       return true;
+       }
+       return false;
+ }
  void throttle_vm_writeout(gfp_t gfp_mask)
  {
        unsigned long background_thresh;
  
          for ( ; ; ) {
                global_dirty_limits(&background_thresh, &dirty_thresh);
-               dirty_thresh = hard_dirty_limit(dirty_thresh);
+               dirty_thresh = hard_dirty_limit(&global_wb_domain, dirty_thresh);
  
                  /*
                   * Boost the allowable dirty threshold a bit for page
@@@ -1667,14 -1955,20 +1955,20 @@@ void laptop_mode_timer_fn(unsigned lon
        struct request_queue *q = (struct request_queue *)data;
        int nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
+       struct bdi_writeback *wb;
+       struct wb_iter iter;
  
        /*
         * We want to write everything out, not just down to the dirty
         * threshold
         */
-       if (bdi_has_dirty_io(&q->backing_dev_info))
-               bdi_start_writeback(&q->backing_dev_info, nr_pages,
-                                       WB_REASON_LAPTOP_TIMER);
+       if (!bdi_has_dirty_io(&q->backing_dev_info))
+               return;
+       bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+               if (wb_has_dirty_io(wb))
+                       wb_start_writeback(wb, nr_pages, true,
+                                          WB_REASON_LAPTOP_TIMER);
  }
  
  /*
@@@ -1718,10 -2012,12 +2012,12 @@@ void laptop_sync_completion(void
  
  void writeback_set_ratelimit(void)
  {
+       struct wb_domain *dom = &global_wb_domain;
        unsigned long background_thresh;
        unsigned long dirty_thresh;
        global_dirty_limits(&background_thresh, &dirty_thresh);
-       global_dirty_limit = dirty_thresh;
+       dom->dirty_limit = dirty_thresh;
        ratelimit_pages = dirty_thresh / (num_online_cpus() * 32);
        if (ratelimit_pages < 16)
                ratelimit_pages = 16;
@@@ -1770,7 -2066,7 +2066,7 @@@ void __init page_writeback_init(void
        writeback_set_ratelimit();
        register_cpu_notifier(&ratelimit_nb);
  
-       fprop_global_init(&writeout_completions, GFP_KERNEL);
+       BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
  }
  
  /**
@@@ -2090,19 -2386,29 +2386,29 @@@ int __set_page_dirty_no_writeback(struc
  
  /*
   * Helper function for set_page_dirty family.
+  *
+  * Caller must hold mem_cgroup_begin_page_stat().
+  *
   * NOTE: This relies on being atomic wrt interrupts.
   */
- void account_page_dirtied(struct page *page, struct address_space *mapping)
+ void account_page_dirtied(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg)
  {
+       struct inode *inode = mapping->host;
        trace_writeback_dirty_page(page, mapping);
  
        if (mapping_cap_account_dirty(mapping)) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct bdi_writeback *wb;
  
+               inode_attach_wb(inode, page);
+               wb = inode_to_wb(inode);
+               mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                __inc_zone_page_state(page, NR_FILE_DIRTY);
                __inc_zone_page_state(page, NR_DIRTIED);
-               __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
-               __inc_bdi_stat(bdi, BDI_DIRTIED);
+               __inc_wb_stat(wb, WB_RECLAIMABLE);
+               __inc_wb_stat(wb, WB_DIRTIED);
                task_io_account_write(PAGE_CACHE_SIZE);
                current->nr_dirtied++;
                this_cpu_inc(bdp_ratelimits);
@@@ -2113,21 -2419,18 +2419,18 @@@ EXPORT_SYMBOL(account_page_dirtied)
  /*
   * Helper function for deaccounting dirty page without writeback.
   *
-  * Doing this should *normally* only ever be done when a page
-  * is truncated, and is not actually mapped anywhere at all. However,
-  * fs/buffer.c does this when it notices that somebody has cleaned
-  * out all the buffers on a page without actually doing it through
-  * the VM. Can you say "ext3 is horribly ugly"? Thought you could.
+  * Caller must hold mem_cgroup_begin_page_stat().
   */
- void account_page_cleaned(struct page *page, struct address_space *mapping)
+ void account_page_cleaned(struct page *page, struct address_space *mapping,
+                         struct mem_cgroup *memcg, struct bdi_writeback *wb)
  {
        if (mapping_cap_account_dirty(mapping)) {
+               mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                dec_zone_page_state(page, NR_FILE_DIRTY);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
+               dec_wb_stat(wb, WB_RECLAIMABLE);
                task_io_account_cancelled_write(PAGE_CACHE_SIZE);
        }
  }
- EXPORT_SYMBOL(account_page_cleaned);
  
  /*
   * For address_spaces which do not use buffers.  Just tag the page as dirty in
   */
  int __set_page_dirty_nobuffers(struct page *page)
  {
+       struct mem_cgroup *memcg;
+       memcg = mem_cgroup_begin_page_stat(page);
        if (!TestSetPageDirty(page)) {
                struct address_space *mapping = page_mapping(page);
                unsigned long flags;
  
-               if (!mapping)
+               if (!mapping) {
+                       mem_cgroup_end_page_stat(memcg);
                        return 1;
+               }
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                BUG_ON(page_mapping(page) != mapping);
                WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-               account_page_dirtied(page, mapping);
+               account_page_dirtied(page, mapping, memcg);
                radix_tree_tag_set(&mapping->page_tree, page_index(page),
                                   PAGECACHE_TAG_DIRTY);
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                if (mapping->host) {
                        /* !PageAnon && !swapper_space */
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
                }
                return 1;
        }
+       mem_cgroup_end_page_stat(memcg);
        return 0;
  }
  EXPORT_SYMBOL(__set_page_dirty_nobuffers);
  void account_page_redirty(struct page *page)
  {
        struct address_space *mapping = page->mapping;
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               bool locked;
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                current->nr_dirtied--;
                dec_zone_page_state(page, NR_DIRTIED);
-               dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
+               dec_wb_stat(wb, WB_DIRTIED);
+               unlocked_inode_to_wb_end(inode, locked);
        }
  }
  EXPORT_SYMBOL(account_page_redirty);
@@@ -2265,6 -2583,43 +2583,43 @@@ int set_page_dirty_lock(struct page *pa
  }
  EXPORT_SYMBOL(set_page_dirty_lock);
  
+ /*
+  * This cancels just the dirty bit on the kernel page itself; it does NOT
+  * actually remove dirty bits on any mmap's that may be around. It also
+  * leaves the page tagged dirty, so any sync activity will still find it on
+  * the dirty lists, and in particular, clear_page_dirty_for_io() will still
+  * look at the dirty bits in the VM.
+  *
+  * This should *normally* only ever be done when a page is truncated and is
+  * not actually mapped anywhere at all. However, fs/buffer.c does this when
+  * it notices that somebody has cleaned out all the buffers on a page
+  * without actually doing it through the VM. Can you say "ext3 is horribly
+  * ugly"? Thought you could.
+  */
+ void cancel_dirty_page(struct page *page)
+ {
+       struct address_space *mapping = page_mapping(page);
+
+       if (mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
+
+               if (TestClearPageDirty(page))
+                       account_page_cleaned(page, mapping, memcg, wb);
+
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+       } else {
+               ClearPageDirty(page);
+       }
+ }
+ EXPORT_SYMBOL(cancel_dirty_page);
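
Roughly how a truncate-style caller is expected to use the cancel_dirty_page() exported above; a sketch modeled on a simplified truncate path rather than the actual fs/ code, with example_truncate_complete_page being a hypothetical name:

#include <linux/mm.h>
#include <linux/pagemap.h>

/* Sketch only -- not part of the merge. */
static void example_truncate_complete_page(struct address_space *mapping,
					   struct page *page)
{
	if (page_has_private(page))
		do_invalidatepage(page, 0, PAGE_CACHE_SIZE);

	/* Drop the dirty bit and its accounting; the page is going away. */
	cancel_dirty_page(page);
	ClearPageMappedToDisk(page);
	delete_from_page_cache(page);
}
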
  /*
   * Clear a page's dirty flag, while caring for dirty memory accounting.
   * Returns true if the page was previously dirty.
  int clear_page_dirty_for_io(struct page *page)
  {
        struct address_space *mapping = page_mapping(page);
+       int ret = 0;
  
        BUG_ON(!PageLocked(page));
  
        if (mapping && mapping_cap_account_dirty(mapping)) {
+               struct inode *inode = mapping->host;
+               struct bdi_writeback *wb;
+               struct mem_cgroup *memcg;
+               bool locked;
+
                /*
                 * Yes, Virginia, this is indeed insane.
                 *
                 * always locked coming in here, so we get the desired
                 * exclusion.
                 */
+               memcg = mem_cgroup_begin_page_stat(page);
+               wb = unlocked_inode_to_wb_begin(inode, &locked);
                if (TestClearPageDirty(page)) {
+                       mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_DIRTY);
                        dec_zone_page_state(page, NR_FILE_DIRTY);
-                       dec_bdi_stat(inode_to_bdi(mapping->host),
-                                       BDI_RECLAIMABLE);
-                       return 1;
+                       dec_wb_stat(wb, WB_RECLAIMABLE);
+                       ret = 1;
                }
-               return 0;
+               unlocked_inode_to_wb_end(inode, locked);
+               mem_cgroup_end_page_stat(memcg);
+               return ret;
        }
        return TestClearPageDirty(page);
  }
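
A sketch of the canonical writeout hand-off that clear_page_dirty_for_io() serves, as in pageout()-style callers (one appears in the mm/vmscan.c hunks below); example_start_writeout is a hypothetical name and error handling is omitted:

#include <linux/pagemap.h>
#include <linux/writeback.h>

/* Sketch only -- not part of the merge. */
static int example_start_writeout(struct page *page,
				  struct writeback_control *wbc)
{
	struct address_space *mapping = page_mapping(page);

	if (!clear_page_dirty_for_io(page))
		return 0;	/* someone else already cleaned or wrote it */

	/* ->writepage() is expected to set PageWriteback and unlock the page. */
	return mapping->a_ops->writepage(page, wbc);
}
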
@@@ -2341,7 -2706,8 +2706,8 @@@ int test_clear_page_writeback(struct pa
  
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi)) {
-                               __dec_bdi_stat(bdi, BDI_WRITEBACK);
-                               __bdi_writeout_inc(bdi);
+                               struct bdi_writeback *wb = inode_to_wb(inode);
+                               __dec_wb_stat(wb, WB_WRITEBACK);
+                               __wb_writeout_inc(wb);
                        }
                }
                spin_unlock_irqrestore(&mapping->tree_lock, flags);
@@@ -2376,7 -2744,8 +2744,8 @@@ int __test_set_page_writeback(struct pa
  
        memcg = mem_cgroup_begin_page_stat(page);
        if (mapping) {
-               struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+               struct inode *inode = mapping->host;
+               struct backing_dev_info *bdi = inode_to_bdi(inode);
                unsigned long flags;
  
                spin_lock_irqsave(&mapping->tree_lock, flags);
                                                page_index(page),
                                                PAGECACHE_TAG_WRITEBACK);
                        if (bdi_cap_account_writeback(bdi))
-                               __inc_bdi_stat(bdi, BDI_WRITEBACK);
+                               __inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
                }
                if (!PageDirty(page))
                        radix_tree_tag_clear(&mapping->page_tree,
diff --combined mm/rmap.c
index 7af1ecb21ccb2d560ca9f0f21e002a26465737f4,8fc556ce2dcb7f538c81a7c590c769d9da34aa6a..171b68768df1478355bcddd5e30c2edd616ba05b
+++ b/mm/rmap.c
@@@ -30,6 -30,8 +30,8 @@@
   *             swap_lock (in swap_duplicate, swap_info_get)
   *               mmlist_lock (in mmput, drain_mmlist and others)
   *               mapping->private_lock (in __set_page_dirty_buffers)
+  *                 mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+  *                   mapping->tree_lock (widely used)
   *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
   *               bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
   *                 sb_lock (within inode_lock in fs/fs-writeback.c)
@@@ -625,7 -627,7 +627,7 @@@ pmd_t *mm_find_pmd(struct mm_struct *mm
  
        pmd = pmd_offset(pud, address);
        /*
 -       * Some THP functions use the sequence pmdp_clear_flush(), set_pmd_at()
 +       * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
         * without holding anon_vma lock for write.  So when looking for a
         * genuine pmde (in which to find pte), test present and !THP together.
         */
@@@ -950,12 -952,7 +952,12 @@@ void page_move_anon_rmap(struct page *p
        VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
  
        anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
 -      page->mapping = (struct address_space *) anon_vma;
 +      /*
 +       * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written
 +       * simultaneously, so a concurrent reader (eg page_referenced()'s
 +       * PageAnon()) will not see one without the other.
 +       */
 +      WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
  }
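
For context, a sketch of the reader side this WRITE_ONCE() pairs with: a page_anon_vma()-style lookup that must observe the anon_vma pointer and the PAGE_MAPPING_ANON bit together. example_page_anon_vma is a made-up name and the real helpers differ in detail:

#include <linux/mm.h>
#include <linux/rmap.h>

/* Sketch only -- not part of the merge. */
static struct anon_vma *example_page_anon_vma(struct page *page)
{
	unsigned long mapping = (unsigned long)READ_ONCE(page->mapping);

	if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
		return NULL;
	/* Strip the PAGE_MAPPING_ANON tag set by the writer above. */
	return (struct anon_vma *)(mapping - PAGE_MAPPING_ANON);
}
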
  
  /**
diff --combined mm/vmscan.c
index 19ef01e90ac42077c3d7898d5ef0d149a166b1aa,8cb16ebaf3ed083a73fc9ed8104a20de71e77dde..e61445dce04e3cc83e9704e84f3d5bf9074b31db
@@@ -154,11 -154,42 +154,42 @@@ static bool global_reclaim(struct scan_
  {
        return !sc->target_mem_cgroup;
  }
+ /**
+  * sane_reclaim - is the usual dirty throttling mechanism operational?
+  * @sc: scan_control in question
+  *
+  * The normal page dirty throttling mechanism in balance_dirty_pages() is
+  * completely broken with the legacy memcg, so direct stalling in
+  * shrink_page_list() is used for throttling instead; it lacks all the
+  * niceties such as fairness, adaptive pausing, bandwidth-proportional
+  * allocation and configurability.
+  *
+  * This function tests whether the vmscan currently in progress can assume
+  * that the normal dirty throttling mechanism is operational.
+  */
+ static bool sane_reclaim(struct scan_control *sc)
+ {
+       struct mem_cgroup *memcg = sc->target_mem_cgroup;
+
+       if (!memcg)
+               return true;
+ #ifdef CONFIG_CGROUP_WRITEBACK
+       if (cgroup_on_dfl(mem_cgroup_css(memcg)->cgroup))
+               return true;
+ #endif
+       return false;
+ }
  #else
  static bool global_reclaim(struct scan_control *sc)
  {
        return true;
  }
+ static bool sane_reclaim(struct scan_control *sc)
+ {
+       return true;
+ }
  #endif
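
A condensed, illustrative view of how sane_reclaim() is consulted in the shrink_page_list() hunks further down; kswapd's immediate-reclaim case, locking and counters are left out, and example_handle_writeback_page is a hypothetical name:

/* Sketch only -- not part of the merge. */
static bool example_handle_writeback_page(struct page *page,
					   struct scan_control *sc)
{
	if (!PageWriteback(page))
		return false;

	if (sane_reclaim(sc) || !PageReclaim(page) ||
	    !(sc->gfp_mask & __GFP_IO)) {
		/* Dirty throttling is operational (or we must not block):
		 * tag the page and keep scanning. */
		SetPageReclaim(page);
		return true;
	}

	/* Legacy memcg reclaim: stall here to avoid dirty-page OOMs. */
	wait_on_page_writeback(page);
	return true;
}
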
  
  static unsigned long zone_reclaimable_pages(struct zone *zone)
@@@ -452,14 -483,13 +483,13 @@@ static inline int is_page_cache_freeabl
        return page_count(page) - page_has_private(page) == 2;
  }
  
- static int may_write_to_queue(struct backing_dev_info *bdi,
-                             struct scan_control *sc)
+ static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
  {
        if (current->flags & PF_SWAPWRITE)
                return 1;
-       if (!bdi_write_congested(bdi))
+       if (!inode_write_congested(inode))
                return 1;
-       if (bdi == current->backing_dev_info)
+       if (inode_to_bdi(inode) == current->backing_dev_info)
                return 1;
        return 0;
  }
@@@ -538,7 -568,7 +568,7 @@@ static pageout_t pageout(struct page *p
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
+       if (!may_write_to_inode(mapping->host, sc))
                return PAGE_KEEP;
  
        if (clear_page_dirty_for_io(page)) {
  static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed)
  {
+       unsigned long flags;
+       struct mem_cgroup *memcg;
+
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
  
-       spin_lock_irq(&mapping->tree_lock);
+       memcg = mem_cgroup_begin_page_stat(page);
+       spin_lock_irqsave(&mapping->tree_lock, flags);
        /*
         * The non racy check for a busy page.
         *
                swp_entry_t swap = { .val = page_private(page) };
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
-               spin_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
                swapcache_free(swap);
        } else {
                void (*freepage)(struct page *);
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping))
                        shadow = workingset_eviction(mapping, page);
-               __delete_from_page_cache(page, shadow);
-               spin_unlock_irq(&mapping->tree_lock);
+               __delete_from_page_cache(page, shadow, memcg);
+               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               mem_cgroup_end_page_stat(memcg);
  
                if (freepage != NULL)
                        freepage(page);
        return 1;
  
  cannot_free:
-       spin_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       mem_cgroup_end_page_stat(memcg);
        return 0;
  }
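
The change above establishes the removal protocol used after this merge: the memcg page-stat lock brackets mapping->tree_lock (now taken irqsave), and __delete_from_page_cache() receives the memcg explicitly. A minimal sketch under those assumptions; example_delete_from_page_cache is a made-up name and ->freepage() handling is omitted:

#include <linux/memcontrol.h>
#include <linux/mm.h>

/* Sketch only -- not part of the merge. */
static void example_delete_from_page_cache(struct address_space *mapping,
					   struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long flags;

	memcg = mem_cgroup_begin_page_stat(page);
	spin_lock_irqsave(&mapping->tree_lock, flags);
	__delete_from_page_cache(page, NULL /* no shadow entry */, memcg);
	spin_unlock_irqrestore(&mapping->tree_lock, flags);
	mem_cgroup_end_page_stat(memcg);
}
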
  
@@@ -917,7 -954,7 +954,7 @@@ static unsigned long shrink_page_list(s
                 */
                mapping = page_mapping(page);
                if (((dirty || writeback) && mapping &&
-                    bdi_write_congested(inode_to_bdi(mapping->host))) ||
+                    inode_write_congested(mapping->host)) ||
                    (writeback && PageReclaim(page)))
                        nr_congested++;
  
                 *    note that the LRU is being scanned too quickly and the
                 *    caller can stall after page list has been processed.
                 *
-                * 2) Global reclaim encounters a page, memcg encounters a
-                *    page that is not marked for immediate reclaim or
-                *    the caller does not have __GFP_IO. In this case mark
-                *    the page for immediate reclaim and continue scanning.
+                * 2) Global or new memcg reclaim encounters a page that is
+                *    not marked for immediate reclaim or the caller does not
+                *    have __GFP_IO. In this case mark the page for immediate
+                *    reclaim and continue scanning.
                 *
                 *    __GFP_IO is checked  because a loop driver thread might
                 *    enter reclaim, and deadlock if it waits on a page for
                 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
                 *    may_enter_fs here is liable to OOM on them.
                 *
-                * 3) memcg encounters a page that is not already marked
+                * 3) Legacy memcg encounters a page that is not already marked
                 *    PageReclaim. memcg does not have any dirty pages
                 *    throttling so we could easily OOM just because too many
                 *    pages are in writeback and there is nothing else to
                                goto keep_locked;
  
                        /* Case 2 above */
-                       } else if (global_reclaim(sc) ||
+                       } else if (sane_reclaim(sc) ||
                            !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
                                /*
                                 * This is slightly racy - end_page_writeback()
@@@ -1416,7 -1453,7 +1453,7 @@@ static int too_many_isolated(struct zon
        if (current_is_kswapd())
                return 0;
  
-       if (!global_reclaim(sc))
+       if (!sane_reclaim(sc))
                return 0;
  
        if (file) {
@@@ -1608,10 -1645,10 +1645,10 @@@ shrink_inactive_list(unsigned long nr_t
                set_bit(ZONE_WRITEBACK, &zone->flags);
  
        /*
-        * memcg will stall in page writeback so only consider forcibly
-        * stalling for global reclaim
+        * Legacy memcg will stall in page writeback so avoid forcibly
+        * stalling here.
         */
-       if (global_reclaim(sc)) {
+       if (sane_reclaim(sc)) {
                /*
                 * Tag a zone as congested if all the dirty pages scanned were
                 * backed by a congested BDI and wait_iff_congested will stall.
@@@ -2646,8 -2683,7 +2683,8 @@@ static bool pfmemalloc_watermark_ok(pg_
  
        for (i = 0; i <= ZONE_NORMAL; i++) {
                zone = &pgdat->node_zones[i];
 -              if (!populated_zone(zone))
 +              if (!populated_zone(zone) ||
 +                  zone_reclaimable_pages(zone) == 0)
                        continue;
  
                pfmemalloc_reserve += min_wmark_pages(zone);
@@@ -3597,7 -3633,7 +3634,7 @@@ int zone_reclaim_mode __read_mostly
  #define RECLAIM_OFF 0
  #define RECLAIM_ZONE (1<<0)   /* Run shrink_inactive_list on the zone */
  #define RECLAIM_WRITE (1<<1)  /* Writeout pages during reclaim */
 -#define RECLAIM_SWAP (1<<2)   /* Swap pages out during reclaim */
 +#define RECLAIM_UNMAP (1<<2)  /* Unmap pages during reclaim */
  
  /*
   * Priority for ZONE_RECLAIM. This determines the fraction of pages
@@@ -3639,12 -3675,12 +3676,12 @@@ static long zone_pagecache_reclaimable(
        long delta = 0;
  
        /*
 -       * If RECLAIM_SWAP is set, then all file pages are considered
 +       * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
         * pages like swapcache and zone_unmapped_file_pages() provides
         * a better estimate
         */
 -      if (zone_reclaim_mode & RECLAIM_SWAP)
 +      if (zone_reclaim_mode & RECLAIM_UNMAP)
                nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
        else
                nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
@@@ -3675,15 -3711,15 +3712,15 @@@ static int __zone_reclaim(struct zone *
                .order = order,
                .priority = ZONE_RECLAIM_PRIORITY,
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 -              .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 +              .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
        };
  
        cond_resched();
        /*
 -       * We need to be able to allocate from the reserves for RECLAIM_SWAP
 +       * We need to be able to allocate from the reserves for RECLAIM_UNMAP
         * and we also need to be able to write out pages for RECLAIM_WRITE
 -       * and RECLAIM_SWAP.
 +       * and RECLAIM_UNMAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
        lockdep_set_current_reclaim_state(gfp_mask);