sched: Extend sched_group_energy to test load-balancing decisions
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index ea966d9193cd9f045b345ff586c31ec96c3b6c93..f3de37414a664e69f3ab61003676ff27289f011c 100644 (file)
@@ -4662,11 +4662,237 @@ static unsigned long capacity_curr_of(int cpu)
               >> SCHED_CAPACITY_SHIFT;
 }
 
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS tasks (i.e. cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static unsigned long __cpu_util(int cpu, int delta)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+       delta += util;
+       if (delta < 0)
+               return 0;
+
+       return (delta >= capacity) ? capacity : delta;
+}
+
+static unsigned long cpu_util(int cpu)
+{
+       return __cpu_util(cpu, 0);
+}
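
As a worked example of the capping rule described above, here is a minimal userspace sketch that mimics __cpu_util(); the helper name and all numbers are hypothetical, chosen only to show the clamp to [0..capacity_orig]:

#include <stdio.h>

/*
 * Mimic of the __cpu_util() clamp: apply a delta to a util value and
 * keep the result within [0..capacity].
 */
static unsigned long clamp_util(long util, long delta, unsigned long capacity)
{
	long sum = util + delta;

	if (sum < 0)			/* never report negative utilization */
		return 0;
	return (sum >= (long)capacity) ? capacity : (unsigned long)sum;
}

int main(void)
{
	/* util_avg overshoots to 1240 on a CPU with capacity_orig = 1024 */
	printf("%lu\n", clamp_util(1240, 0, 1024));	/* 1024: capped */
	/* the same CPU after removing a task worth 300 units of util */
	printf("%lu\n", clamp_util(1240, -300, 1024));	/* 940 */
	return 0;
}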
+
 static inline bool energy_aware(void)
 {
        return sched_feat(ENERGY_AWARE);
 }
 
+struct energy_env {
+       struct sched_group      *sg_top;
+       struct sched_group      *sg_cap;
+       int                     cap_idx;
+       int                     util_delta;
+       int                     src_cpu;
+       int                     dst_cpu;
+       int                     energy;
+};
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ *   capacity = capacity_orig * curr_freq/max_freq
+ *
+ *   norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+       int util = __cpu_util(cpu, delta);
+
+       if (util >= capacity)
+               return SCHED_CAPACITY_SCALE;
+
+       return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
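
A short userspace sketch of this normalization, with hypothetical numbers (SCALE stands in for SCHED_CAPACITY_SCALE): 300 units of scale-invariant util against a 600-unit capacity state gives a busy ratio of 512/1024, i.e. roughly 50%.

#include <stdio.h>

#define SCALE_SHIFT	10			/* stand-in for SCHED_CAPACITY_SHIFT */
#define SCALE		(1UL << SCALE_SHIFT)	/* stand-in for SCHED_CAPACITY_SCALE */

/* Normalize util against a given capacity, as __cpu_norm_util() does. */
static unsigned long norm_util(unsigned long util, unsigned long capacity)
{
	if (util >= capacity)		/* fully busy at this capacity */
		return SCALE;
	return (util << SCALE_SHIFT) / capacity;
}

int main(void)
{
	printf("%lu\n", norm_util(300, 600));	/* 512, i.e. ~50% busy */
	return 0;
}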
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+       if (cpu == eenv->src_cpu)
+               return -eenv->util_delta;
+       if (cpu == eenv->dst_cpu)
+               return eenv->util_delta;
+       return 0;
+}
+
+static unsigned long group_max_util(struct energy_env *eenv)
+{
+       int i, delta;
+       unsigned long max_util = 0;
+
+       for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+               delta = calc_util_delta(eenv, i);
+               max_util = max(max_util, __cpu_util(i, delta));
+       }
+
+       return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to its
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group, the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned long group_norm_util(struct energy_env *eenv,
+				      struct sched_group *sg)
+{
+       int i, delta;
+       unsigned long util_sum = 0;
+       unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+       for_each_cpu(i, sched_group_cpus(sg)) {
+               delta = calc_util_delta(eenv, i);
+               util_sum += __cpu_norm_util(i, capacity, delta);
+       }
+
+       if (util_sum > SCHED_CAPACITY_SCALE)
+               return SCHED_CAPACITY_SCALE;
+       return util_sum;
+}
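
To see why the sum is the pessimistic choice, consider a hypothetical two-CPU group running at an 800-unit capacity state with per-cpu utils of 200 and 300: the normalized utils are 256 and 384, so the true group busy ratio lies somewhere between max = 384 (executions overlap completely) and sum = 640 (no overlap at all). A userspace sketch with those made-up numbers:

#include <stdio.h>

#define SCALE_SHIFT	10
#define SCALE		(1UL << SCALE_SHIFT)

static unsigned long norm(unsigned long util, unsigned long cap)
{
	return util >= cap ? SCALE : (util << SCALE_SHIFT) / cap;
}

int main(void)
{
	unsigned long cpu_util[] = { 200, 300 };	/* hypothetical per-cpu util */
	unsigned long cap = 800;			/* capacity at the chosen cap state */
	unsigned long sum = 0, max = 0;
	unsigned int i;

	for (i = 0; i < 2; i++) {
		unsigned long n = norm(cpu_util[i], cap);

		sum += n;
		if (n > max)
			max = n;
	}
	if (sum > SCALE)	/* a group can never be more than 100% busy */
		sum = SCALE;

	printf("max=%lu sum=%lu\n", max, sum);	/* max=384 sum=640 */
	return 0;
}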
+
+static int find_new_capacity(struct energy_env *eenv,
+	const struct sched_group_energy * const sge)
+{
+       int idx;
+       unsigned long util = group_max_util(eenv);
+
+       for (idx = 0; idx < sge->nr_cap_states; idx++) {
+               if (sge->cap_states[idx].cap >= util)
+                       break;
+       }
+
+       eenv->cap_idx = idx;
+
+       return idx;
+}
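
find_new_capacity() walks cap_states[] from the lowest capacity upwards and stops at the first state large enough to hold the group's maximum util (as returned by group_max_util()). A userspace sketch with three hypothetical operating points and a max util of 500, where the middle state is selected:

#include <stdio.h>

struct cap_state {
	unsigned long cap;	/* capacity at this operating point */
	unsigned long power;	/* busy power at this operating point */
};

int main(void)
{
	/* hypothetical capacity states, ordered from lowest to highest */
	struct cap_state states[] = { { 430, 80 }, { 765, 200 }, { 1024, 350 } };
	unsigned long max_util = 500;	/* hypothetical group_max_util() result */
	int idx;

	/* pick the lowest capacity state that still fits the max util */
	for (idx = 0; idx < 3; idx++)
		if (states[idx].cap >= max_util)
			break;

	printf("cap_idx=%d cap=%lu\n", idx, states[idx].cap);	/* cap_idx=1 cap=765 */
	return 0;
}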
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group, starting from the bottom and working its way up, before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+       struct sched_domain *sd;
+       int cpu, total_energy = 0;
+       struct cpumask visit_cpus;
+       struct sched_group *sg;
+
+       WARN_ON(!eenv->sg_top->sge);
+
+       cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+       while (!cpumask_empty(&visit_cpus)) {
+               struct sched_group *sg_shared_cap = NULL;
+
+               cpu = cpumask_first(&visit_cpus);
+
+               /*
+                * Is the group utilization affected by cpus outside this
+                * sched_group?
+                */
+               sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+               if (!sd)
+                       /*
+                        * We most probably raced with hotplug; returning a
+                        * wrong energy estimation is better than entering an
+                        * infinite loop.
+                        */
+                       return -EINVAL;
+
+               if (sd->parent)
+                       sg_shared_cap = sd->parent->groups;
+
+               for_each_domain(cpu, sd) {
+                       sg = sd->groups;
+
+                       /* Has this sched_domain already been visited? */
+                       if (sd->child && group_first_cpu(sg) != cpu)
+                               break;
+
+                       do {
+                               unsigned long group_util;
+                               int sg_busy_energy, sg_idle_energy, cap_idx;
+
+                               if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+                                       eenv->sg_cap = sg_shared_cap;
+                               else
+                                       eenv->sg_cap = sg;
+
+                               cap_idx = find_new_capacity(eenv, sg->sge);
+                               group_util = group_norm_util(eenv, sg);
+                               sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+                                                                               >> SCHED_CAPACITY_SHIFT;
+                               sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
+                                                                               >> SCHED_CAPACITY_SHIFT;
+
+                               total_energy += sg_busy_energy + sg_idle_energy;
+
+                               if (!sd->child)
+                                       cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+                               if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+                                       goto next_cpu;
+
+                       } while (sg = sg->next, sg != sd->groups);
+               }
+next_cpu:
+               continue;
+       }
+
+       eenv->energy = total_energy;
+       return 0;
+}
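
Each group's contribution is a time-weighted blend of busy and idle power: the group spends a group_util fraction of the time at the busy power of the selected capacity state and the remainder at the power of the shallowest idle state. A sketch with hypothetical numbers (group_util 640/1024, busy power 200, idle power 10), giving 125 + 3 = 128 units:

#include <stdio.h>

#define SCALE_SHIFT	10
#define SCALE		(1UL << SCALE_SHIFT)

int main(void)
{
	unsigned long group_util = 640;		/* normalized group busy ratio */
	unsigned long busy_power = 200;		/* power at the selected cap state */
	unsigned long idle_power = 10;		/* power in idle_states[0] */
	unsigned long busy_energy, idle_energy;

	/* same blend as computed per sched_group in sched_group_energy() */
	busy_energy = (group_util * busy_power) >> SCALE_SHIFT;
	idle_energy = ((SCALE - group_util) * idle_power) >> SCALE_SHIFT;

	printf("busy=%lu idle=%lu total=%lu\n",
	       busy_energy, idle_energy, busy_energy + idle_energy);
	/* busy=125 idle=3 total=128 */
	return 0;
}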
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened
@@ -4788,8 +5014,6 @@ static inline bool task_fits_max(struct task_struct *p, int cpu)
        return __task_fits(p, cpu, 0);
 }
 
-static int cpu_util(int cpu);
-
 static inline bool task_fits_spare(struct task_struct *p, int cpu)
 {
        return __task_fits(p, cpu, cpu_util(cpu));
@@ -4988,40 +5212,6 @@ done:
        return target;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
-
-       return (util >= capacity) ? capacity : util;
-}
-
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,