sched: Energy-aware wake-up task placement
author	Morten Rasmussen <morten.rasmussen@arm.com>
Sat, 9 May 2015 19:03:19 +0000 (20:03 +0100)
committer	Punit Agrawal <punit.agrawal@arm.com>
Mon, 21 Mar 2016 12:34:30 +0000 (12:34 +0000)
Let available compute capacity and estimated energy impact select the
wake-up target cpu when energy-aware scheduling is enabled and the
system is not over-utilized (i.e. below the tipping point).

energy_aware_wake_cpu() attempts to find a group of cpus with sufficient
compute capacity to accommodate the task, and then a cpu with enough spare
capacity to handle the task within that group. Preference is given to
cpus with enough spare capacity at the current OPP. Finally, the estimated
energy impact of placing the task on the new target cpu is compared against
leaving it on the previous task cpu to select the wake-up target cpu.
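
For illustration, below is a minimal user-space sketch of the decision
flow described above. It is not the kernel implementation (see the
fair.c hunk further down): the cpu capacities, utilization values and
the energy_diff() stand-in are made-up numbers, and the group-selection
and idle-cpu details are simplified away.

#include <stdio.h>

#define NR_CPUS 4

/* Hypothetical per-cpu state: original capacity, capacity at the current
 * OPP, and current utilization (all illustrative values). */
static const int capacity_orig[NR_CPUS] = { 512, 512, 1024, 1024 };
static const int capacity_curr[NR_CPUS] = { 384, 384,  768,  768 };
static const int cpu_util[NR_CPUS]      = { 100, 300,  200,  600 };

/* Toy stand-in for energy_diff(): >= 0 means moving the task is not
 * expected to save energy. */
static int energy_diff(int src_cpu, int dst_cpu, int task_util)
{
	return task_util * (capacity_orig[dst_cpu] - capacity_orig[src_cpu]) / 512;
}

static int energy_aware_wake_cpu_sketch(int prev_cpu, int task_util)
{
	int target_cpu = prev_cpu;
	int i;

	/* Pick a cpu with spare capacity, preferring one that can take the
	 * task without raising the OPP. (Group selection is omitted here.) */
	for (i = 0; i < NR_CPUS; i++) {
		int new_util = cpu_util[i] + task_util;

		if (new_util > capacity_orig[i])
			continue;		/* would over-utilize this cpu */

		if (new_util < capacity_curr[i]) {
			target_cpu = i;		/* fits at the current OPP */
			break;
		}

		if (target_cpu == prev_cpu)
			target_cpu = i;		/* fallback: fits at a higher OPP */
	}

	/* Only migrate if the estimated energy impact is negative. */
	if (target_cpu != prev_cpu &&
	    energy_diff(prev_cpu, target_cpu, task_util) >= 0)
		return prev_cpu;

	return target_cpu;
}

int main(void)
{
	printf("wake-up target: cpu%d\n",
	       energy_aware_wake_cpu_sketch(3, 150));
	return 0;
}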

cc: Ingo Molnar <mingo@redhat.com>
cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Morten Rasmussen <morten.rasmussen@arm.com>
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c11bc73929163a879d5b94d638833dcc0c550f79..682b4ae9ebd76feb140be803ef4cf8d6da31d9bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5287,6 +5287,86 @@ done:
        return target;
 }
 
+static int energy_aware_wake_cpu(struct task_struct *p, int target)
+{
+       struct sched_domain *sd;
+       struct sched_group *sg, *sg_target;
+       int target_max_cap = INT_MAX;
+       int target_cpu = task_cpu(p);
+       int i;
+
+       sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
+
+       if (!sd)
+               return target;
+
+       sg = sd->groups;
+       sg_target = sg;
+
+       /*
+        * Find group with sufficient capacity. We only get here if no cpu is
+        * overutilized. We may end up overutilizing a cpu by adding the task,
+        * but that should not be any worse than select_idle_sibling().
+        * load_balance() should sort it out later as we get above the tipping
+        * point.
+        */
+       do {
+               /* Assuming all cpus are the same in group */
+               int max_cap_cpu = group_first_cpu(sg);
+
+               /*
+                * Assume smaller max capacity means more energy-efficient.
+                * Ideally we should query the energy model for the right
+                * answer but it easily ends up in an exhaustive search.
+                */
+               if (capacity_of(max_cap_cpu) < target_max_cap &&
+                   task_fits_max(p, max_cap_cpu)) {
+                       sg_target = sg;
+                       target_max_cap = capacity_of(max_cap_cpu);
+               }
+       } while (sg = sg->next, sg != sd->groups);
+
+       /* Find cpu with sufficient capacity */
+       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+               /*
+                * p's blocked utilization is still accounted for on prev_cpu
+                * so prev_cpu will receive a negative bias due to the double
+                * accounting. However, the blocked utilization may be zero.
+                */
+               int new_util = cpu_util(i) + task_util(p);
+
+               if (new_util > capacity_orig_of(i))
+                       continue;
+
+               if (new_util < capacity_curr_of(i)) {
+                       target_cpu = i;
+                       if (cpu_rq(i)->nr_running)
+                               break;
+               }
+
+               /* cpu has capacity at higher OPP, keep it as fallback */
+               if (target_cpu == task_cpu(p))
+                       target_cpu = i;
+       }
+
+       if (target_cpu != task_cpu(p)) {
+               struct energy_env eenv = {
+                       .util_delta     = task_util(p),
+                       .src_cpu        = task_cpu(p),
+                       .dst_cpu        = target_cpu,
+               };
+
+               /* Not enough spare capacity on previous cpu */
+               if (cpu_overutilized(task_cpu(p)))
+                       return target_cpu;
+
+               if (energy_diff(&eenv) >= 0)
+                       return task_cpu(p);
+       }
+
+       return target_cpu;
+}
+
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5309,8 +5389,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = !wake_wide(p) && task_fits_max(p, cpu) &&
-                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
+                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
+                             energy_aware();
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
@@ -5340,7 +5421,9 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        }
 
        if (!sd) {
-               if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
+               if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
+                       new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+               else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, new_cpu);
 
        } else while (sd) {