Merge commit 'v2.6.38-rc6' into for-2.6.39/core

author Jens Axboe <jaxboe@fusionio.com>
Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
committer Jens Axboe <jaxboe@fusionio.com>
Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
diff --combined block/blk-core.c

index ab4a7696956d9c2ae4599e9cd4261caf958f18a0,2f4002f79a24b3cf242c870282d96859dc475dc9..3cc17e6064d68e5a3315d012a395c84f660bcb8a
--- 1/block/blk-core.c
--- 2/block/blk-core.c
+++ b/block/blk-core.c
@@@ -33,7 -33,7 +33,7 @@@
   
   #include "blk.h"
   
- EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+ EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
   EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
   EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
   
@@@ -64,13 -64,27 +64,27 @@@ static void drive_stat_acct(struct requ
                 return;
   
         cpu = part_stat_lock();
-       part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
   
-       if (!new_io)
+       if (!new_io) {
+               part = rq->part;
                 part_stat_inc(cpu, part, merges[rw]);
-       else {
+       } else {
+               part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+               if (!hd_struct_try_get(part)) {
+                       /*
+                        * The partition is already being removed,
+                        * the request will be accounted on the disk only
+                        *
+                        * We take a reference on disk->part0 although that
+                        * partition will never be deleted, so we can treat
+                        * it as any other partition.
+                        */
+                       part = &rq->rq_disk->part0;
+                       hd_struct_get(part);
+               }
                 part_round_stats(cpu, part);
                 part_inc_in_flight(part, rw);
+               rq->part = part;
         }
   
         part_stat_unlock();
@@@ -128,35 -142,46 +142,36 @@@ void blk_rq_init(struct request_queue *
         rq->ref_count = 1;
         rq->start_time = jiffies;
         set_start_time_ns(rq);
+       rq->part = NULL;
   }
   EXPORT_SYMBOL(blk_rq_init);
   
   static void req_bio_endio(struct request *rq, struct bio *bio,
                           unsigned int nbytes, int error)
   {
- -      struct request_queue *q = rq->q;
- -
- -      if (&q->flush_rq != rq) {
- -              if (error)
- -                      clear_bit(BIO_UPTODATE, &bio->bi_flags);
- -              else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- -                      error = -EIO;
- -
- -              if (unlikely(nbytes > bio->bi_size)) {
- -                      printk(KERN_ERR "%s: want %u bytes done, %u left\n",
- -                             __func__, nbytes, bio->bi_size);
- -                      nbytes = bio->bi_size;
- -              }
+ +      if (error)
+ +              clear_bit(BIO_UPTODATE, &bio->bi_flags);
+ +      else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+ +              error = -EIO;
+ +
+ +      if (unlikely(nbytes > bio->bi_size)) {
+ +              printk(KERN_ERR "%s: want %u bytes done, %u left\n",
+ +                     __func__, nbytes, bio->bi_size);
+ +              nbytes = bio->bi_size;
+ +      }
   
- -              if (unlikely(rq->cmd_flags & REQ_QUIET))
- -                      set_bit(BIO_QUIET, &bio->bi_flags);
+ +      if (unlikely(rq->cmd_flags & REQ_QUIET))
+ +              set_bit(BIO_QUIET, &bio->bi_flags);
   
- -              bio->bi_size -= nbytes;
- -              bio->bi_sector += (nbytes >> 9);
+ +      bio->bi_size -= nbytes;
+ +      bio->bi_sector += (nbytes >> 9);
   
- -              if (bio_integrity(bio))
- -                      bio_integrity_advance(bio, nbytes);
+ +      if (bio_integrity(bio))
+ +              bio_integrity_advance(bio, nbytes);
   
- -              if (bio->bi_size == 0)
- -                      bio_endio(bio, error);
- -      } else {
- -              /*
- -               * Okay, this is the sequenced flush request in
- -               * progress, just record the error;
- -               */
- -              if (error && !q->flush_err)
- -                      q->flush_err = error;
- -      }
+ +      /* don't actually finish bio if it's part of flush sequence */
+ +      if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
+ +              bio_endio(bio, error);
   }
   
   void blk_dump_rq_flags(struct request *rq, char *msg)
@@@ -515,9 -540,7 +530,9 @@@ struct request_queue *blk_alloc_queue_n
         init_timer(&q->unplug_timer);
         setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
         INIT_LIST_HEAD(&q->timeout_list);
- -      INIT_LIST_HEAD(&q->pending_flushes);
+ +      INIT_LIST_HEAD(&q->flush_queue[0]);
+ +      INIT_LIST_HEAD(&q->flush_queue[1]);
+ +      INIT_LIST_HEAD(&q->flush_data_in_flight);
         INIT_WORK(&q->unplug_work, blk_unplug_work);
   
         kobject_init(&q->kobj, &blk_queue_ktype);
@@@ -737,25 -760,6 +752,25 @@@ static void freed_request(struct reques
                 __freed_request(q, sync ^ 1);
   }
   
+ +/*
+ + * Determine if elevator data should be initialized when allocating the
+ + * request associated with @bio.
+ + */
+ +static bool blk_rq_should_init_elevator(struct bio *bio)
+ +{
+ +      if (!bio)
+ +              return true;
+ +
+ +      /*
+ +       * Flush requests do not use the elevator so skip initialization.
+ +       * This allows a request to share the flush and elevator data.
+ +       */
+ +      if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
+ +              return false;
+ +
+ +      return true;
+ +}
+ +
   /*
    * Get a free request, queue_lock must be held.
    * Returns NULL on failure, with queue_lock held.
@@@ -768,7 -772,7 +783,7 @@@ static struct request *get_request(stru
         struct request_list *rl = &q->rq;
         struct io_context *ioc = NULL;
         const bool is_sync = rw_is_sync(rw_flags) != 0;
- -      int may_queue, priv;
+ +      int may_queue, priv = 0;
   
         may_queue = elv_may_queue(q, rw_flags);
         if (may_queue == ELV_MQUEUE_NO)
@@@ -812,11 -816,9 +827,11 @@@
         rl->count[is_sync]++;
         rl->starved[is_sync] = 0;
   
- -      priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
- -      if (priv)
- -              rl->elvpriv++;
+ +      if (blk_rq_should_init_elevator(bio)) {
+ +              priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
+ +              if (priv)
+ +                      rl->elvpriv++;
+ +      }
   
         if (blk_queue_io_stat(q))
                 rw_flags |= REQ_IO_STAT;
@@@ -1217,7 -1219,7 +1232,7 @@@ static int __make_request(struct reques
         spin_lock_irq(q->queue_lock);
   
         if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
- -              where = ELEVATOR_INSERT_FRONT;
+ +              where = ELEVATOR_INSERT_FLUSH;
                 goto get_rq;
         }
   
@@@ -1342,9 -1344,9 +1357,9 @@@ static inline void blk_partition_remap(
                 bio->bi_sector += p->start_sect;
                 bio->bi_bdev = bdev->bd_contains;
   
-               trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev,
-                                   bio->bi_sector - p->start_sect);
+               trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
+                                     bdev->bd_dev,
+                                     bio->bi_sector - p->start_sect);
         }
   }
   
@@@ -1513,7 -1515,7 +1528,7 @@@ static inline void __generic_make_reque
                         goto end_io;
   
                 if (old_sector != -1)
-                       trace_block_remap(q, bio, old_dev, old_sector);
+                       trace_block_bio_remap(q, bio, old_dev, old_sector);
   
                 old_sector = bio->bi_sector;
                 old_dev = bio->bi_bdev->bd_dev;
@@@ -1789,7 -1791,7 +1804,7 @@@ static void blk_account_io_completion(s
                 int cpu;
   
                 cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
                 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
                 part_stat_unlock();
         }
@@@ -1802,20 -1804,21 +1817,21 @@@ static void blk_account_io_done(struct 
          * normal IO on queueing nor completion.  Accounting the
          * containing request is enough.
          */
- -      if (blk_do_io_stat(req) && req != &req->q->flush_rq) {
+ +      if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
                 unsigned long duration = jiffies - req->start_time;
                 const int rw = rq_data_dir(req);
                 struct hd_struct *part;
                 int cpu;
   
                 cpu = part_stat_lock();
-               part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
+               part = req->part;
   
                 part_stat_inc(cpu, part, ios[rw]);
                 part_stat_add(cpu, part, ticks[rw], duration);
                 part_round_stats(cpu, part);
                 part_dec_in_flight(part, rw);
   
+               hd_struct_put(part);
                 part_stat_unlock();
         }
   }
@@@ -2619,7 -2622,9 +2635,9 @@@ int __init blk_dev_init(void
         BUILD_BUG_ON(__REQ_NR_BITS > 8 *
                         sizeof(((struct request *)0)->cmd_flags));
   
-       kblockd_workqueue = create_workqueue("kblockd");
+       /* used for unplugging and affects IO latency/throughput - HIGHPRI */
+       kblockd_workqueue = alloc_workqueue("kblockd",
+                                           WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
         if (!kblockd_workqueue)
                 panic("Failed to create kblockd\n");
   
diff --combined block/cfq-iosched.c

index 968455c57e1a1edeacc6da28fc1fea0ad4c3343c,7be4c79596250d28cb82afa9e1aef085abc6a66b..f27ff3efe6cd2cf92af18aeb0ae063f7dc38ab3a
--- 1/block/cfq-iosched.c
--- 2/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@@ -54,9 -54,9 +54,9 @@@ static const int cfq_hist_divisor = 4
   #define CFQQ_SEEKY(cfqq)      (hweight32(cfqq->seek_history) > 32/8)
   
   #define RQ_CIC(rq)            \
- -      ((struct cfq_io_context *) (rq)->elevator_private)
- -#define RQ_CFQQ(rq)           (struct cfq_queue *) ((rq)->elevator_private2)
- -#define RQ_CFQG(rq)           (struct cfq_group *) ((rq)->elevator_private3)
+ +      ((struct cfq_io_context *) (rq)->elevator_private[0])
+ +#define RQ_CFQQ(rq)           (struct cfq_queue *) ((rq)->elevator_private[1])
+ +#define RQ_CFQG(rq)           (struct cfq_group *) ((rq)->elevator_private[2])
   
   static struct kmem_cache *cfq_pool;
   static struct kmem_cache *cfq_ioc_pool;
@@@ -87,7 -87,6 +87,6 @@@ struct cfq_rb_root 
         unsigned count;
         unsigned total_weight;
         u64 min_vdisktime;
-       struct rb_node *active;
   };
   #define CFQ_RB_ROOT   (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
                         .count = 0, .min_vdisktime = 0, }
@@@ -97,7 -96,7 +96,7 @@@
    */
   struct cfq_queue {
         /* reference count */
-       atomic_t ref;
+       int ref;
         /* various state flags, see below */
         unsigned int flags;
         /* parent cfq_data */
@@@ -180,7 -179,6 +179,6 @@@ struct cfq_group 
         /* group service_tree key */
         u64 vdisktime;
         unsigned int weight;
-       bool on_st;
   
         /* number of cfqq currently on this group */
         int nr_cfqq;
@@@ -209,7 -207,7 +207,7 @@@
         struct blkio_group blkg;
   #ifdef CONFIG_CFQ_GROUP_IOSCHED
         struct hlist_node cfqd_node;
-       atomic_t ref;
+       int ref;
   #endif
         /* number of requests that are on the dispatch list or inside driver */
         int dispatched;
@@@ -563,11 -561,6 +561,6 @@@ static void update_min_vdisktime(struc
         u64 vdisktime = st->min_vdisktime;
         struct cfq_group *cfqg;
   
-       if (st->active) {
-               cfqg = rb_entry_cfqg(st->active);
-               vdisktime = cfqg->vdisktime;
-       }
- 
         if (st->left) {
                 cfqg = rb_entry_cfqg(st->left);
                 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
@@@ -605,8 -598,8 +598,8 @@@ cfq_group_slice(struct cfq_data *cfqd, 
         return cfq_target_latency * cfqg->weight / st->total_weight;
   }
   
- static inline void
- cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ static inline unsigned
+ cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
   {
         unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
         if (cfqd->cfq_latency) {
@@@ -632,6 -625,14 +625,14 @@@
                                     low_slice);
                 }
         }
+       return slice;
+ }
+ 
+ static inline void
+ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
+ {
+       unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);
+ 
         cfqq->slice_start = jiffies;
         cfqq->slice_end = jiffies + slice;
         cfqq->allocated_slice = slice;
@@@ -646,11 -647,11 +647,11 @@@
   static inline bool cfq_slice_used(struct cfq_queue *cfqq)
   {
         if (cfq_cfqq_slice_new(cfqq))
-               return 0;
+               return false;
         if (time_before(jiffies, cfqq->slice_end))
-               return 0;
+               return false;
   
-       return 1;
+       return true;
   }
   
   /*
@@@ -869,7 -870,7 +870,7 @@@ cfq_group_service_tree_add(struct cfq_d
         struct rb_node *n;
   
         cfqg->nr_cfqq++;
-       if (cfqg->on_st)
+       if (!RB_EMPTY_NODE(&cfqg->rb_node))
                 return;
   
         /*
@@@ -885,7 -886,6 +886,6 @@@
                 cfqg->vdisktime = st->min_vdisktime;
   
         __cfq_group_service_tree_add(st, cfqg);
-       cfqg->on_st = true;
         st->total_weight += cfqg->weight;
   }
   
@@@ -894,9 -894,6 +894,6 @@@ cfq_group_service_tree_del(struct cfq_d
   {
         struct cfq_rb_root *st = &cfqd->grp_service_tree;
   
-       if (st->active == &cfqg->rb_node)
-               st->active = NULL;
- 
         BUG_ON(cfqg->nr_cfqq < 1);
         cfqg->nr_cfqq--;
   
@@@ -905,7 -902,6 +902,6 @@@
                 return;
   
         cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
-       cfqg->on_st = false;
         st->total_weight -= cfqg->weight;
         if (!RB_EMPTY_NODE(&cfqg->rb_node))
                 cfq_rb_erase(&cfqg->rb_node, st);
@@@ -1026,11 -1022,11 +1022,11 @@@ cfq_find_alloc_cfqg(struct cfq_data *cf
          * elevator which will be dropped by either elevator exit
          * or cgroup deletion path depending on who is exiting first.
          */
-       atomic_set(&cfqg->ref, 1);
+       cfqg->ref = 1;
   
         /*
          * Add group onto cgroup list. It might happen that bdi->dev is
-        * not initiliazed yet. Initialize this new group without major
+        * not initialized yet. Initialize this new group without major
          * and minor info and this info will be filled in once a new thread
          * comes for IO. See code above.
          */
@@@ -1071,7 -1067,7 +1067,7 @@@ static struct cfq_group *cfq_get_cfqg(s
   
   static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
   {
-       atomic_inc(&cfqg->ref);
+       cfqg->ref++;
         return cfqg;
   }
   
@@@ -1083,7 -1079,7 +1079,7 @@@ static void cfq_link_cfqq_cfqg(struct c
   
         cfqq->cfqg = cfqg;
         /* cfqq reference on cfqg */
-       atomic_inc(&cfqq->cfqg->ref);
+       cfqq->cfqg->ref++;
   }
   
   static void cfq_put_cfqg(struct cfq_group *cfqg)
@@@ -1091,11 -1087,12 +1087,12 @@@
         struct cfq_rb_root *st;
         int i, j;
   
-       BUG_ON(atomic_read(&cfqg->ref) <= 0);
-       if (!atomic_dec_and_test(&cfqg->ref))
+       BUG_ON(cfqg->ref <= 0);
+       cfqg->ref--;
+       if (cfqg->ref)
                 return;
         for_each_cfqg_st(cfqg, i, j, st)
-               BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
+               BUG_ON(!RB_EMPTY_ROOT(&st->rb));
         kfree(cfqg);
   }
   
@@@ -1200,7 -1197,7 +1197,7 @@@ static void cfq_service_tree_add(struc
                         cfq_group_service_tree_del(cfqd, cfqq->cfqg);
                 cfqq->orig_cfqg = cfqq->cfqg;
                 cfqq->cfqg = &cfqd->root_group;
-               atomic_inc(&cfqd->root_group.ref);
+               cfqd->root_group.ref++;
                 group_changed = 1;
         } else if (!cfqd->cfq_group_isolation
                    && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
@@@ -1672,8 -1669,11 +1669,11 @@@ __cfq_slice_expired(struct cfq_data *cf
         /*
          * store what was left of this slice, if the queue idled/timed out
          */
-       if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
-               cfqq->slice_resid = cfqq->slice_end - jiffies;
+       if (timed_out) {
+               if (cfq_cfqq_slice_new(cfqq))
+                       cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
+               else
+                       cfqq->slice_resid = cfqq->slice_end - jiffies;
                 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
         }
   
@@@ -1687,9 -1687,6 +1687,6 @@@
         if (cfqq == cfqd->active_queue)
                 cfqd->active_queue = NULL;
   
-       if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
-               cfqd->grp_service_tree.active = NULL;
- 
         if (cfqd->active_cic) {
                 put_io_context(cfqd->active_cic->ioc);
                 cfqd->active_cic = NULL;
@@@ -1901,10 -1898,10 +1898,10 @@@ static bool cfq_should_idle(struct cfq_
          * in their service tree.
          */
         if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
-               return 1;
+               return true;
         cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
                         service_tree->count);
-       return 0;
+       return false;
   }
   
   static void cfq_arm_slice_timer(struct cfq_data *cfqd)
@@@ -2040,7 -2037,7 +2037,7 @@@ static int cfqq_process_refs(struct cfq
         int process_refs, io_refs;
   
         io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
-       process_refs = atomic_read(&cfqq->ref) - io_refs;
+       process_refs = cfqq->ref - io_refs;
         BUG_ON(process_refs < 0);
         return process_refs;
   }
@@@ -2080,10 -2077,10 +2077,10 @@@ static void cfq_setup_merge(struct cfq_
          */
         if (new_process_refs >= process_refs) {
                 cfqq->new_cfqq = new_cfqq;
-               atomic_add(process_refs, &new_cfqq->ref);
+               new_cfqq->ref += process_refs;
         } else {
                 new_cfqq->new_cfqq = cfqq;
-               atomic_add(new_process_refs, &cfqq->ref);
+               cfqq->ref += new_process_refs;
         }
   }
   
@@@ -2116,12 -2113,7 +2113,7 @@@ static void choose_service_tree(struct 
         unsigned count;
         struct cfq_rb_root *st;
         unsigned group_slice;
- 
-       if (!cfqg) {
-               cfqd->serving_prio = IDLE_WORKLOAD;
-               cfqd->workload_expires = jiffies + 1;
-               return;
-       }
+       enum wl_prio_t original_prio = cfqd->serving_prio;
   
         /* Choose next priority. RT > BE > IDLE */
         if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
@@@ -2134,6 -2126,9 +2126,9 @@@
                 return;
         }
   
+       if (original_prio != cfqd->serving_prio)
+               goto new_workload;
+ 
         /*
          * For RT and BE, we have to choose also the type
          * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
@@@ -2148,6 -2143,7 +2143,7 @@@
         if (count && !time_after(jiffies, cfqd->workload_expires))
                 return;
   
+ new_workload:
         /* otherwise select new workload type */
         cfqd->serving_type =
                 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
@@@ -2199,7 -2195,6 +2195,6 @@@ static struct cfq_group *cfq_get_next_c
         if (RB_EMPTY_ROOT(&st->rb))
                 return NULL;
         cfqg = cfq_rb_first_group(st);
-       st->active = &cfqg->rb_node;
         update_min_vdisktime(st);
         return cfqg;
   }
@@@ -2293,6 -2288,17 +2288,17 @@@ static struct cfq_queue *cfq_select_que
                 goto keep_queue;
         }
   
+       /*
+        * This is a deep seek queue, but the device is much faster than
+        * the queue can deliver, don't idle
+        **/
+       if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
+           (cfq_cfqq_slice_new(cfqq) ||
+           (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
+               cfq_clear_cfqq_deep(cfqq);
+               cfq_clear_cfqq_idle_window(cfqq);
+       }
+ 
         if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
                 cfqq = NULL;
                 goto keep_queue;
@@@ -2367,12 -2373,12 +2373,12 @@@ static inline bool cfq_slice_used_soon(
   {
         /* the queue hasn't finished any request, can't estimate */
         if (cfq_cfqq_slice_new(cfqq))
-               return 1;
+               return true;
         if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
                 cfqq->slice_end))
-               return 1;
+               return true;
   
-       return 0;
+       return false;
   }
   
   static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
@@@ -2538,9 -2544,10 +2544,10 @@@ static void cfq_put_queue(struct cfq_qu
         struct cfq_data *cfqd = cfqq->cfqd;
         struct cfq_group *cfqg, *orig_cfqg;
   
-       BUG_ON(atomic_read(&cfqq->ref) <= 0);
+       BUG_ON(cfqq->ref <= 0);
   
-       if (!atomic_dec_and_test(&cfqq->ref))
+       cfqq->ref--;
+       if (cfqq->ref)
                 return;
   
         cfq_log_cfqq(cfqd, cfqq, "put_queue");
@@@ -2843,7 -2850,7 +2850,7 @@@ static void cfq_init_cfqq(struct cfq_da
         RB_CLEAR_NODE(&cfqq->p_node);
         INIT_LIST_HEAD(&cfqq->fifo);
   
-       atomic_set(&cfqq->ref, 0);
+       cfqq->ref = 0;
         cfqq->cfqd = cfqd;
   
         cfq_mark_cfqq_prio_changed(cfqq);
@@@ -2979,11 -2986,11 +2986,11 @@@ cfq_get_queue(struct cfq_data *cfqd, bo
          * pin the queue now that it's allocated, scheduler exit will prune it
          */
         if (!is_sync && !(*async_cfqq)) {
-               atomic_inc(&cfqq->ref);
+               cfqq->ref++;
                 *async_cfqq = cfqq;
         }
   
-       atomic_inc(&cfqq->ref);
+       cfqq->ref++;
         return cfqq;
   }
   
@@@ -3265,6 -3272,10 +3272,10 @@@ cfq_should_preempt(struct cfq_data *cfq
         if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
                 return true;
   
+       /* An idle queue should not be idle now for some reason */
+       if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
+               return true;
+ 
         if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
                 return false;
   
@@@ -3284,9 -3295,18 +3295,18 @@@
    */
   static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
   {
+       struct cfq_queue *old_cfqq = cfqd->active_queue;
+ 
         cfq_log_cfqq(cfqd, cfqq, "preempt");
         cfq_slice_expired(cfqd, 1);
   
+       /*
+        * workload type is changed, don't save slice, otherwise preempt
+        * doesn't happen
+        */
+       if (cfqq_type(old_cfqq) != cfqq_type(cfqq))
+               cfqq->cfqg->saved_workload_slice = 0;
+ 
         /*
          * Put the new queue at the front of the of the current list,
          * so we know that it will be selected next.
@@@ -3412,6 -3432,10 +3432,10 @@@ static bool cfq_should_wait_busy(struc
   {
         struct cfq_io_context *cic = cfqd->active_cic;
   
+       /* If the queue already has requests, don't wait */
+       if (!RB_EMPTY_ROOT(&cfqq->sort_list))
+               return false;
+ 
         /* If there are other queues in the group, don't wait */
         if (cfqq->cfqg->nr_cfqq > 1)
                 return false;
@@@ -3589,12 -3613,12 +3613,12 @@@ static void cfq_put_request(struct requ
   
                 put_io_context(RQ_CIC(rq)->ioc);
   
- -              rq->elevator_private = NULL;
- -              rq->elevator_private2 = NULL;
+ +              rq->elevator_private[0] = NULL;
+ +              rq->elevator_private[1] = NULL;
   
                 /* Put down rq reference on cfqg */
                 cfq_put_cfqg(RQ_CFQG(rq));
- -              rq->elevator_private3 = NULL;
+ +              rq->elevator_private[2] = NULL;
   
                 cfq_put_queue(cfqq);
         }
@@@ -3681,13 -3705,13 +3705,13 @@@ new_queue
         }
   
         cfqq->allocated[rw]++;
-       atomic_inc(&cfqq->ref);
- -      cfqq->ref++;
- -      rq->elevator_private = cic;
- -      rq->elevator_private2 = cfqq;
- -      rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
   
         spin_unlock_irqrestore(q->queue_lock, flags);
   
++      cfqq->ref++;
+ +      rq->elevator_private[0] = cic;
+ +      rq->elevator_private[1] = cfqq;
+ +      rq->elevator_private[2] = cfq_ref_get_cfqg(cfqq->cfqg);
         return 0;
   
   queue_fail:
@@@ -3862,6 -3886,10 +3886,10 @@@ static void *cfq_init_queue(struct requ
         if (!cfqd)
                 return NULL;
   
+       /*
+        * Don't need take queue_lock in the routine, since we are
+        * initializing the ioscheduler, and nobody is using cfqd
+        */
         cfqd->cic_index = i;
   
         /* Init root service tree */
@@@ -3881,7 -3909,7 +3909,7 @@@
          * Take a reference to root group which we never drop. This is just
          * to make sure that cfq_put_cfqg() does not try to kfree root group
          */
-       atomic_set(&cfqg->ref, 1);
+       cfqg->ref = 1;
         rcu_read_lock();
         cfq_blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg,
                                         (void *)cfqd, 0);
@@@ -3901,7 -3929,7 +3929,7 @@@
          * will not attempt to free it.
          */
         cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
-       atomic_inc(&cfqd->oom_cfqq.ref);
+       cfqd->oom_cfqq.ref++;
         cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
   
         INIT_LIST_HEAD(&cfqd->cic_list);
diff --combined include/linux/blkdev.h

index 12bb426949e9f68a42c5b505db6c63650d9b6128,4d18ff34670a4a882e5d08e83b1633ecd2973610..e3ee74fc59030fe6b4c2c15bc8c4ef85a03e0096
--- 1/include/linux/blkdev.h
--- 2/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@@ -108,19 -108,14 +108,20 @@@ struct request 
   
         /*
          * Three pointers are available for the IO schedulers, if they need
- -       * more they have to dynamically allocate it.
+ +       * more they have to dynamically allocate it.  Flush requests are
+ +       * never put on the IO scheduler. So let the flush fields share
+ +       * space with the three elevator_private pointers.
          */
- -      void *elevator_private;
- -      void *elevator_private2;
- -      void *elevator_private3;
+ +      union {
+ +              void *elevator_private[3];
+ +              struct {
+ +                      unsigned int            seq;
+ +                      struct list_head        list;
+ +              } flush;
+ +      };
   
         struct gendisk *rq_disk;
+       struct hd_struct *part;
         unsigned long start_time;
   #ifdef CONFIG_BLK_CGROUP
         unsigned long long start_time_ns;
@@@ -368,12 -363,11 +369,12 @@@ struct request_queu
          * for flush operations
          */
         unsigned int            flush_flags;
- -      unsigned int            flush_seq;
- -      int                     flush_err;
+ +      unsigned int            flush_pending_idx:1;
+ +      unsigned int            flush_running_idx:1;
+ +      unsigned long           flush_pending_since;
+ +      struct list_head        flush_queue[2];
+ +      struct list_head        flush_data_in_flight;
         struct request          flush_rq;
- -      struct request          *orig_flush_rq;
- -      struct list_head        pending_flushes;
   
         struct mutex            sysfs_lock;
   
@@@ -653,7 -647,6 +654,6 @@@ static inline void rq_flush_dcache_page
   
   extern int blk_register_queue(struct gendisk *disk);
   extern void blk_unregister_queue(struct gendisk *disk);
- extern void register_disk(struct gendisk *dev);
   extern void generic_make_request(struct bio *bio);
   extern void blk_rq_init(struct request_queue *q, struct request *rq);
   extern void blk_put_request(struct request *);
@@@ -1263,6 -1256,9 +1263,9 @@@ struct block_device_operations 
         int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
         int (*direct_access) (struct block_device *, sector_t,
                                                 void **, unsigned long *);
+       unsigned int (*check_events) (struct gendisk *disk,
+                                     unsigned int clearing);
+       /* ->media_changed() is DEPRECATED, use ->check_events() instead */
         int (*media_changed) (struct gendisk *);
         void (*unlock_native_capacity) (struct gendisk *);
         int (*revalidate_disk) (struct gendisk *);
diff --combined include/linux/elevator.h

index 86120c916fcc05c2190db55b0a8e0b350610e3ab,4d857973d2c94317cf11041a4a7070794fc13a99..39b68edb388d34df1d0988382cd8e825d3fc66d7
--- 1/include/linux/elevator.h
--- 2/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@@ -167,7 -167,6 +167,7 @@@ extern struct request *elv_rb_find(stru
   #define ELEVATOR_INSERT_BACK  2
   #define ELEVATOR_INSERT_SORT  3
   #define ELEVATOR_INSERT_REQUEUE       4
+ +#define ELEVATOR_INSERT_FLUSH 5
   
   /*
    * return values from elevator_may_queue_fn
@@@ -196,15 -195,9 +196,9 @@@ enum 
   /*
    * io context count accounting
    */
- #define elv_ioc_count_mod(name, __val)                                \
-       do {                                                    \
-               preempt_disable();                              \
-               __get_cpu_var(name) += (__val);                 \
-               preempt_enable();                               \
-       } while (0)
- 
- #define elv_ioc_count_inc(name)       elv_ioc_count_mod(name, 1)
- #define elv_ioc_count_dec(name)       elv_ioc_count_mod(name, -1)
+ #define elv_ioc_count_mod(name, __val) this_cpu_add(name, __val)
+ #define elv_ioc_count_inc(name)       this_cpu_inc(name)
+ #define elv_ioc_count_dec(name)       this_cpu_dec(name)
   
   #define elv_ioc_count_read(name)                              \
   ({                                                            \
author	Jens Axboe <jaxboe@fusionio.com>
	Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
committer	Jens Axboe <jaxboe@fusionio.com>
	Tue, 1 Mar 2011 20:04:39 +0000 (15:04 -0500)
		1	2
block/blk-core.c	patch \|	diff1 \|	diff2 \|	blob \| history
block/cfq-iosched.c	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/blkdev.h	patch \|	diff1 \|	diff2 \|	blob \| history
include/linux/elevator.h	patch \|	diff1 \|	diff2 \|	blob \| history