sched: Extend sched_group_energy to test load-balancing decisions
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index 90e26b11deaa1ab4b78302605850523a7852720b..f3de37414a664e69f3ab61003676ff27289f011c 100644 (file)
@@ -2689,7 +2689,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        int decayed, removed = 0;
 
        if (atomic_long_read(&cfs_rq->removed_load_avg)) {
-               long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
+               s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
                sa->load_avg = max_t(long, sa->load_avg - r, 0);
                sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
                removed = 1;
@@ -2809,27 +2809,45 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
                max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
 }
 
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
-       struct cfs_rq *cfs_rq = cfs_rq_of(se);
-       u64 last_update_time;
-
 #ifndef CONFIG_64BIT
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
        u64 last_update_time_copy;
+       u64 last_update_time;
 
        do {
                last_update_time_copy = cfs_rq->load_last_update_time_copy;
                smp_rmb();
                last_update_time = cfs_rq->avg.last_update_time;
        } while (last_update_time != last_update_time_copy);
+
+       return last_update_time;
+}
 #else
-       last_update_time = cfs_rq->avg.last_update_time;
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+       return cfs_rq->avg.last_update_time;
+}
 #endif
 
+/*
+ * Task first catches up with cfs_rq, and then subtract
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+       struct cfs_rq *cfs_rq = cfs_rq_of(se);
+       u64 last_update_time;
+
+       /*
+        * Newly created task or never used group entity should not be removed
+        * from its (source) cfs_rq
+        */
+       if (se->avg.last_update_time == 0)
+               return;
+
+       last_update_time = cfs_rq_last_update_time(cfs_rq);
+
        __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
        atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
        atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
@@ -4633,6 +4651,248 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
 #endif
 
+/*
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+static unsigned long capacity_curr_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig *
+              arch_scale_freq_capacity(NULL, cpu)
+              >> SCHED_CAPACITY_SHIFT;
+}
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static unsigned long __cpu_util(int cpu, int delta)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+       delta += util;
+       if (delta < 0)
+               return 0;
+
+       return (delta >= capacity) ? capacity : delta;
+}
+
+static unsigned long cpu_util(int cpu)
+{
+       return __cpu_util(cpu, 0);
+}
+
+static inline bool energy_aware(void)
+{
+       return sched_feat(ENERGY_AWARE);
+}
+
+struct energy_env {
+       struct sched_group      *sg_top;
+       struct sched_group      *sg_cap;
+       int                     cap_idx;
+       int                     util_delta;
+       int                     src_cpu;
+       int                     dst_cpu;
+       int                     energy;
+};
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
+ * energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
+ *
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * the normalized util can be found using the specific capacity.
+ *
+ *   capacity = capacity_orig * curr_freq/max_freq
+ *
+ *   norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+       int util = __cpu_util(cpu, delta);
+
+       if (util >= capacity)
+               return SCHED_CAPACITY_SCALE;
+
+       return (util << SCHED_CAPACITY_SHIFT)/capacity;
+}
+
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+       if (cpu == eenv->src_cpu)
+               return -eenv->util_delta;
+       if (cpu == eenv->dst_cpu)
+               return eenv->util_delta;
+       return 0;
+}
+
+static
+unsigned long group_max_util(struct energy_env *eenv)
+{
+       int i, delta;
+       unsigned long max_util = 0;
+
+       for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) {
+               delta = calc_util_delta(eenv, i);
+               max_util = max(max_util, __cpu_util(i, delta));
+       }
+
+       return max_util;
+}
+
+/*
+ * group_norm_util() returns the approximated group util relative to it's
+ * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in
+ * energy calculations. Since task executions may or may not overlap in time in
+ * the group the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned
+long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+       int i, delta;
+       unsigned long util_sum = 0;
+       unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+
+       for_each_cpu(i, sched_group_cpus(sg)) {
+               delta = calc_util_delta(eenv, i);
+               util_sum += __cpu_norm_util(i, capacity, delta);
+       }
+
+       if (util_sum > SCHED_CAPACITY_SCALE)
+               return SCHED_CAPACITY_SCALE;
+       return util_sum;
+}
+
+static int find_new_capacity(struct energy_env *eenv,
+       const struct sched_group_energy const *sge)
+{
+       int idx;
+       unsigned long util = group_max_util(eenv);
+
+       for (idx = 0; idx < sge->nr_cap_states; idx++) {
+               if (sge->cap_states[idx].cap >= util)
+                       break;
+       }
+
+       eenv->cap_idx = idx;
+
+       return idx;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working it's way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+       struct sched_domain *sd;
+       int cpu, total_energy = 0;
+       struct cpumask visit_cpus;
+       struct sched_group *sg;
+
+       WARN_ON(!eenv->sg_top->sge);
+
+       cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+       while (!cpumask_empty(&visit_cpus)) {
+               struct sched_group *sg_shared_cap = NULL;
+
+               cpu = cpumask_first(&visit_cpus);
+
+               /*
+                * Is the group utilization affected by cpus outside this
+                * sched_group?
+                */
+               sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+               if (!sd)
+                       /*
+                        * We most probably raced with hotplug; returning a
+                        * wrong energy estimation is better than entering an
+                        * infinite loop.
+                        */
+                       return -EINVAL;
+
+               if (sd->parent)
+                       sg_shared_cap = sd->parent->groups;
+
+               for_each_domain(cpu, sd) {
+                       sg = sd->groups;
+
+                       /* Has this sched_domain already been visited? */
+                       if (sd->child && group_first_cpu(sg) != cpu)
+                               break;
+
+                       do {
+                               unsigned long group_util;
+                               int sg_busy_energy, sg_idle_energy, cap_idx;
+
+                               if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+                                       eenv->sg_cap = sg_shared_cap;
+                               else
+                                       eenv->sg_cap = sg;
+
+                               cap_idx = find_new_capacity(eenv, sg->sge);
+                               group_util = group_norm_util(eenv, sg);
+                               sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+                                                                               >> SCHED_CAPACITY_SHIFT;
+                               sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
+                                                                               >> SCHED_CAPACITY_SHIFT;
+
+                               total_energy += sg_busy_energy + sg_idle_energy;
+
+                               if (!sd->child)
+                                       cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+                               if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+                                       goto next_cpu;
+
+                       } while (sg = sg->next, sg != sd->groups);
+               }
+next_cpu:
+               continue;
+       }
+
+       eenv->energy = total_energy;
+       return 0;
+}
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened
@@ -4724,6 +4984,46 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
        return 1;
 }
 
+static inline unsigned long task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+       unsigned long capacity = capacity_of(cpu);
+
+       util += task_util(p);
+
+       return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+       unsigned long capacity = capacity_of(cpu);
+       unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       if (capacity == max_capacity)
+               return true;
+
+       if (capacity * capacity_margin > max_capacity * 1024)
+               return true;
+
+       return __task_fits(p, cpu, 0);
+}
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+       return __task_fits(p, cpu, cpu_util(cpu));
+}
+
+static bool cpu_overutilized(int cpu)
+{
+       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -4733,7 +5033,10 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                  int this_cpu, int sd_flag)
 {
        struct sched_group *idlest = NULL, *group = sd->groups;
+       struct sched_group *fit_group = NULL, *spare_group = NULL;
        unsigned long min_load = ULONG_MAX, this_load = 0;
+       unsigned long fit_capacity = ULONG_MAX;
+       unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
@@ -4741,7 +5044,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                load_idx = sd->wake_idx;
 
        do {
-               unsigned long load, avg_load;
+               unsigned long load, avg_load, spare_capacity;
                int local_group;
                int i;
 
@@ -4764,6 +5067,25 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                                load = target_load(i, load_idx);
 
                        avg_load += load;
+
+                       /*
+                        * Look for most energy-efficient group that can fit
+                        * that can fit the task.
+                        */
+                       if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+                               fit_capacity = capacity_of(i);
+                               fit_group = group;
+                       }
+
+                       /*
+                        * Look for group which has most spare capacity on a
+                        * single cpu.
+                        */
+                       spare_capacity = capacity_of(i) - cpu_util(i);
+                       if (spare_capacity > max_spare_capacity) {
+                               max_spare_capacity = spare_capacity;
+                               spare_group = group;
+                       }
                }
 
                /* Adjust by relative CPU capacity of the group */
@@ -4777,6 +5099,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
                }
        } while (group = group->next, group != sd->groups);
 
+       if (fit_group)
+               return fit_group;
+
+       if (spare_group)
+               return spare_group;
+
        if (!idlest || 100*this_load < imbalance*min_load)
                return NULL;
        return idlest;
@@ -4797,7 +5125,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
        /* Traverse only the allowed CPUs */
        for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
-               if (idle_cpu(i)) {
+               if (task_fits_spare(p, i)) {
                        struct rq *rq = cpu_rq(i);
                        struct cpuidle_state *idle = idle_get_state(rq);
                        if (idle && idle->exit_latency < min_exit_latency) {
@@ -4809,7 +5137,8 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                min_exit_latency = idle->exit_latency;
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
-                       } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+                       } else if (idle_cpu(i) &&
+                                  (!idle || idle->exit_latency == min_exit_latency) &&
                                   rq->idle_stamp > latest_idle_timestamp) {
                                /*
                                 * If equal or no active idle state, then
@@ -4818,6 +5147,13 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                                 */
                                latest_idle_timestamp = rq->idle_stamp;
                                shallowest_idle_cpu = i;
+                       } else if (shallowest_idle_cpu == -1) {
+                               /*
+                                * If we haven't found an idle CPU yet
+                                * pick a non-idle one that can fit the task as
+                                * fallback.
+                                */
+                               shallowest_idle_cpu = i;
                        }
                } else if (shallowest_idle_cpu == -1) {
                        load = weighted_cpuload(i);
@@ -4876,40 +5212,6 @@ done:
        return target;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
-
-       return (util >= capacity) ? capacity : util;
-}
-
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -4932,7 +5234,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
        int sync = wake_flags & WF_SYNC;
 
        if (sd_flag & SD_BALANCE_WAKE)
-               want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+               want_affine = !wake_wide(p) && task_fits_max(p, cpu) &&
+                             cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
 
        rcu_read_lock();
        for_each_domain(cpu, tmp) {
@@ -5532,6 +5835,7 @@ struct lb_env {
        int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
+       unsigned int            src_grp_nr_running;
        /* The set of CPUs under consideration for load-balancing */
        struct cpumask          *cpus;
 
@@ -6494,6 +6798,8 @@ next_group:
        if (env->sd->flags & SD_NUMA)
                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
+       env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
        if (!env->sd->parent) {
                /* update overload indicator if we are at root domain */
                if (env->dst_rq->rd->overload != overload)
@@ -6903,6 +7209,13 @@ static int need_active_balance(struct lb_env *env)
                        return 1;
        }
 
+       if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+                               env->src_rq->cfs.h_nr_running == 1 &&
+                               cpu_overutilized(env->src_cpu) &&
+                               !cpu_overutilized(env->dst_cpu)) {
+                       return 1;
+       }
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -7115,7 +7428,8 @@ more_balance:
                 * excessive cache_hot migrations and active balances.
                 */
                if (idle != CPU_NEWLY_IDLE)
-                       sd->nr_balance_failed++;
+                       if (env.src_grp_nr_running > 1)
+                               sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
                        raw_spin_lock_irqsave(&busiest->lock, flags);