max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
-/*
- * Task first catches up with cfs_rq, and then subtract
- * itself from the cfs_rq (task must be off the queue now).
- */
-void remove_entity_load_avg(struct sched_entity *se)
-{
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
- u64 last_update_time;
-
#ifndef CONFIG_64BIT
+/*
+ * Read cfs_rq->avg.last_update_time when CONFIG_64BIT is not set.
+ *
+ * The value is re-read until it equals cfs_rq->load_last_update_time_copy;
+ * smp_rmb() orders the read of the copy before the read of the value itself.
+ * NOTE(review): presumably this protects against a torn 64-bit read racing
+ * with a writer that updates both fields -- confirm against the writer side.
+ */
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
u64 last_update_time_copy;
+ u64 last_update_time;
do {
last_update_time_copy = cfs_rq->load_last_update_time_copy;
smp_rmb();
last_update_time = cfs_rq->avg.last_update_time;
} while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
#else
- last_update_time = cfs_rq->avg.last_update_time;
+/* With CONFIG_64BIT the timestamp is returned with a single plain read. */
+static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->avg.last_update_time;
+}
#endif
+/*
+ * Task first catches up with cfs_rq, and then subtracts
+ * itself from the cfs_rq (task must be off the queue now).
+ */
+void remove_entity_load_avg(struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 last_update_time;
+
+ /*
+ * Newly created task or never used group entity should not be removed
+ * from its (source) cfs_rq
+ */
+ if (se->avg.last_update_time == 0)
+ return;
+
+ last_update_time = cfs_rq_last_update_time(cfs_rq);
+
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
#endif
+/*
+ * capacity_curr_of() - capacity of @cpu with both cpu and frequency scaling
+ * applied.
+ *
+ * The rq's cpu_capacity_orig is multiplied by arch_scale_freq_capacity() and
+ * the product is shifted back down by SCHED_CAPACITY_SHIFT.
+ */
+static unsigned long capacity_curr_of(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	return (rq->cpu_capacity_orig * arch_scale_freq_capacity(NULL, cpu)) >>
+	       SCHED_CAPACITY_SHIFT;
+}
+
+/*
+ * __cpu_util() - CFS utilization of @cpu, adjusted by @delta and clamped.
+ *
+ * The result is expressed in capacity units so that it can be compared
+ * directly with the capacity the cpu has available for CFS tasks
+ * (cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of the running time of runnable tasks plus
+ * the recent utilization of currently non-runnable tasks on the cpu. It
+ * represents utilization in the range [0..capacity_orig], where capacity_orig
+ * is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()). Because it is running time scaled by
+ * capacity_curr, it converges towards a value equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig).
+ *
+ * util_avg can nevertheless exceed capacity_curr, or even capacity_orig,
+ * because of unfortunate rounding or right after task migrations and new task
+ * wakeups, until the average settles on the new running time. The value is
+ * therefore clamped to [0..capacity_orig]. Without the upper clamp a group
+ * could look overloaded (CPU0 at 121% + CPU1 at 80%) although CPU1 still has
+ * 20% of spare capacity. Overshooting capacity_curr (but not capacity_orig)
+ * is allowed because it is useful for predicting the capacity required after
+ * task migrations (scheduler-driven DVFS).
+ *
+ * @delta is added to util_avg before clamping; a negative sum yields 0.
+ */
+static unsigned long __cpu_util(int cpu, int delta)
+{
+	unsigned long cfs_util = cpu_rq(cpu)->cfs.avg.util_avg;
+	unsigned long max_cap = capacity_orig_of(cpu);
+
+	delta += cfs_util;
+
+	/* Lower clamp: removing more than is there leaves nothing. */
+	if (delta < 0)
+		return 0;
+
+	/* Upper clamp: never report more than capacity_orig. */
+	if (delta >= max_cap)
+		return max_cap;
+
+	return delta;
+}
+
+/* cpu_util() - clamped CFS utilization of @cpu with no adjustment applied. */
+static unsigned long cpu_util(int cpu)
+{
+	return __cpu_util(cpu, 0);
+}
+
+/* Returns sched_feat(ENERGY_AWARE) as a bool. */
+static inline bool energy_aware(void)
+{
+ return sched_feat(ENERGY_AWARE);
+}
+
+/* Inputs and result of one sched_group_energy() evaluation. */
+struct energy_env {
+ struct sched_group *sg_top; /* group whose cpus sched_group_energy() walks */
+ struct sched_group *sg_cap; /* group scanned by group_max_util(); set by sched_group_energy() */
+ int cap_idx; /* index into sge->cap_states[]; written by find_new_capacity() */
+ int util_delta; /* utilization calc_util_delta() subtracts on src_cpu and adds on dst_cpu */
+ int src_cpu; /* cpu that gets -util_delta */
+ int dst_cpu; /* cpu that gets +util_delta */
+ int energy; /* total written by sched_group_energy() when it returns 0 */
+};
+
+/*
+ * __cpu_norm_util() - utilization of @cpu relative to a given @capacity.
+ *
+ * Returns the busy ratio of the cpu in the range [0..SCHED_CAPACITY_SCALE],
+ * which is the form needed by the energy calculations. It starts from the
+ * scale-invariant utilization returned by __cpu_util() (with @delta applied),
+ * which can be approximated as:
+ *
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
+ *
+ * With
+ *
+ *   capacity = capacity_orig * curr_freq/max_freq
+ *
+ * the normalized value follows as:
+ *
+ *   norm_util = running_time/time ~ util/capacity
+ */
+static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta)
+{
+	int util = __cpu_util(cpu, delta);
+
+	/* Below the reference capacity: scale up, then divide. */
+	if (util < capacity)
+		return (util << SCHED_CAPACITY_SHIFT)/capacity;
+
+	/* At or above it the cpu counts as fully busy. */
+	return SCHED_CAPACITY_SCALE;
+}
+
+/*
+ * calc_util_delta() - utilization change that @eenv implies for @cpu.
+ *
+ * The source cpu loses eenv->util_delta, the destination cpu gains it and
+ * every other cpu is unaffected. The source check comes first, so it wins
+ * when src_cpu and dst_cpu are the same cpu.
+ */
+static int calc_util_delta(struct energy_env *eenv, int cpu)
+{
+	int delta = 0;
+
+	if (cpu == eenv->src_cpu)
+		delta = -eenv->util_delta;
+	else if (cpu == eenv->dst_cpu)
+		delta = eenv->util_delta;
+
+	return delta;
+}
+
+/*
+ * group_max_util() - highest per-cpu utilization among the cpus of
+ * eenv->sg_cap, with the utilization change described by @eenv applied.
+ */
+static unsigned long group_max_util(struct energy_env *eenv)
+{
+	unsigned long highest = 0;
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
+		unsigned long util = __cpu_util(cpu, calc_util_delta(eenv, cpu));
+
+		if (util > highest)
+			highest = util;
+	}
+
+	return highest;
+}
+
+/*
+ * group_norm_util() - approximate busy ratio of @sg relative to the capacity
+ * of the state selected by eenv->cap_idx, in [0..SCHED_CAPACITY_SCALE].
+ *
+ * Task executions within the group may or may not overlap in time, so the
+ * true normalized utilization lies somewhere between max(cpu_norm_util(i))
+ * and sum(cpu_norm_util(i)) over all cpus i of the group. The sum is used as
+ * the estimate because it gives the more pessimistic (more busy) energy
+ * figure; it is capped at SCHED_CAPACITY_SCALE.
+ */
+static unsigned long
+group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+{
+	unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+	unsigned long total = 0;
+	int cpu;
+
+	for_each_cpu(cpu, sched_group_cpus(sg))
+		total += __cpu_norm_util(cpu, capacity, calc_util_delta(eenv, cpu));
+
+	return (total > SCHED_CAPACITY_SCALE) ? SCHED_CAPACITY_SCALE : total;
+}
+
+/*
+ * find_new_capacity() - pick the capacity state for the cpus of eenv->sg_cap.
+ *
+ * Walks sge->cap_states[] from index 0 and selects the first state whose cap
+ * is at least group_max_util(eenv). If no state is large enough, the last
+ * state is used: callers index cap_states[] directly with the result, so the
+ * index must stay inside the table whenever the table is non-empty.
+ *
+ * The chosen index is stored in eenv->cap_idx and also returned.
+ */
+static int find_new_capacity(struct energy_env *eenv,
+	const struct sched_group_energy * const sge)
+{
+	int idx;
+	unsigned long util = group_max_util(eenv);
+
+	for (idx = 0; idx < sge->nr_cap_states; idx++) {
+		if (sge->cap_states[idx].cap >= util)
+			break;
+	}
+
+	/* Nothing fits: fall back to the last state instead of running off the table. */
+	if (idx > 0 && idx == sge->nr_cap_states)
+		idx--;
+
+	eenv->cap_idx = idx;
+
+	return idx;
+}
+
+/*
+ * sched_group_energy(): Computes the absolute energy consumption of cpus
+ * belonging to the sched_group including shared resources shared only by
+ * members of the group. Iterates over all cpus in the hierarchy below the
+ * sched_group starting from the bottom working its way up before going to
+ * the next cpu until all cpus are covered at all levels. The current
+ * implementation is likely to gather the same util statistics multiple times.
+ * This can probably be done in a faster but more complex way.
+ * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ *
+ * The group to evaluate is eenv->sg_top. On success the total is stored in
+ * eenv->energy and 0 is returned. If no sd_scs domain is found for a cpu,
+ * -EINVAL is returned and eenv->energy is left untouched. eenv->sg_cap and
+ * eenv->cap_idx are overwritten while iterating.
+ */
+static int sched_group_energy(struct energy_env *eenv)
+{
+ struct sched_domain *sd;
+ int cpu, total_energy = 0;
+ struct cpumask visit_cpus;
+ struct sched_group *sg;
+
+ WARN_ON(!eenv->sg_top->sge);
+
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
+
+ while (!cpumask_empty(&visit_cpus)) {
+ struct sched_group *sg_shared_cap = NULL;
+
+ cpu = cpumask_first(&visit_cpus);
+
+ /*
+ * Is the group utilization affected by cpus outside this
+ * sched_group?
+ * If the sd_scs domain of this cpu has a parent, the parent's
+ * first group is remembered as the candidate that defines the
+ * capacity state (see the sg_cap selection below).
+ */
+ sd = rcu_dereference(per_cpu(sd_scs, cpu));
+
+ if (!sd)
+ /*
+ * We most probably raced with hotplug; returning a
+ * wrong energy estimation is better than entering an
+ * infinite loop.
+ * NOTE(review): the code bails out with -EINVAL here
+ * rather than returning an estimate.
+ */
+ return -EINVAL;
+
+ if (sd->parent)
+ sg_shared_cap = sd->parent->groups;
+
+ for_each_domain(cpu, sd) {
+ sg = sd->groups;
+
+ /*
+ * Has this sched_domain already been visited?
+ * Above the lowest level, only the first cpu of the
+ * domain's first group continues upwards.
+ */
+ if (sd->child && group_first_cpu(sg) != cpu)
+ break;
+
+ do {
+ unsigned long group_util;
+ int sg_busy_energy, sg_idle_energy, cap_idx;
+
+ /* Use the shared-capacity group when it is at least as wide as sg. */
+ if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+ eenv->sg_cap = sg_shared_cap;
+ else
+ eenv->sg_cap = sg;
+
+ /*
+ * The busy share of the group is charged at the power of the
+ * selected capacity state, the remainder at idle_states[0].power.
+ */
+ cap_idx = find_new_capacity(eenv, sg->sge);
+ group_util = group_norm_util(eenv, sg);
+ sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
+ >> SCHED_CAPACITY_SHIFT;
+ sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
+ >> SCHED_CAPACITY_SHIFT;
+
+ total_energy += sg_busy_energy + sg_idle_energy;
+
+ /* Lowest level: these cpus are accounted, drop them from the visit mask. */
+ if (!sd->child)
+ cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
+
+ /* A group spanning exactly sg_top's cpus was just accounted: stop climbing. */
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
+ goto next_cpu;
+
+ } while (sg = sg->next, sg != sd->groups);
+ }
+next_cpu:
+ continue;
+ }
+
+ eenv->energy = total_energy;
+ return 0;
+}
+
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
* A waker of many should wake a different task than the one last awakened
return 1;
}
+/* Returns the util_avg value of the task's scheduling entity (p->se.avg). */
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+/*
+ * Factor, relative to 1024, by which utilization is inflated before it is
+ * compared with capacity in __task_fits() and cpu_overutilized():
+ * util * 1280 has to stay below capacity * 1024, i.e. util below ~80% of
+ * the capacity.
+ */
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+
+/*
+ * __task_fits() - would @p fit on @cpu on top of @util already in use?
+ *
+ * The task's utilization is added to @util and the sum, inflated by
+ * capacity_margin/1024, has to stay strictly below capacity_of(@cpu).
+ */
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+	unsigned long cpu_cap = capacity_of(cpu);
+
+	util += task_util(p);
+
+	return (util * capacity_margin) < (cpu_cap * 1024);
+}
+
+/*
+ * task_fits_max() - is @cpu an acceptable fit for @p, judged against the
+ * largest cpu capacity in the root domain?
+ *
+ * Returns true right away when @cpu has the root domain's max_cpu_capacity,
+ * or when its capacity inflated by capacity_margin/1024 exceeds that maximum.
+ * Otherwise the task has to fit on the cpu by itself (__task_fits() with no
+ * other utilization accounted).
+ */
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+	unsigned long cap = capacity_of(cpu);
+	unsigned long biggest = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+	if (cap == biggest || cap * capacity_margin > biggest * 1024)
+		return true;
+
+	return __task_fits(p, cpu, 0);
+}
+
+/*
+ * task_fits_spare() - does @p fit on @cpu on top of the cpu's current
+ * cpu_util()?
+ */
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+	int used = cpu_util(cpu);
+
+	return __task_fits(p, cpu, used);
+}
+
+/*
+ * cpu_overutilized() - true when the utilization of @cpu, inflated by
+ * capacity_margin/1024, exceeds its capacity_of().
+ */
+static bool cpu_overutilized(int cpu)
+{
+	return (cpu_util(cpu) * capacity_margin) > (capacity_of(cpu) * 1024);
+}
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
+ struct sched_group *fit_group = NULL, *spare_group = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
+ unsigned long fit_capacity = ULONG_MAX;
+ unsigned long max_spare_capacity = capacity_margin - SCHED_LOAD_SCALE;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
load_idx = sd->wake_idx;
do {
- unsigned long load, avg_load;
+ unsigned long load, avg_load, spare_capacity;
int local_group;
int i;
load = target_load(i, load_idx);
avg_load += load;
+
+ /*
+ * Look for the most energy-efficient group
+ * that can fit the task.
+ */
+ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+ fit_capacity = capacity_of(i);
+ fit_group = group;
+ }
+
+ /*
+ * Look for group which has most spare capacity on a
+ * single cpu.
+ */
+ spare_capacity = capacity_of(i) - cpu_util(i);
+ if (spare_capacity > max_spare_capacity) {
+ max_spare_capacity = spare_capacity;
+ spare_group = group;
+ }
}
/* Adjust by relative CPU capacity of the group */
}
} while (group = group->next, group != sd->groups);
+ if (fit_group)
+ return fit_group;
+
+ if (spare_group)
+ return spare_group;
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
- if (idle_cpu(i)) {
+ if (task_fits_spare(p, i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
- } else if ((!idle || idle->exit_latency == min_exit_latency) &&
+ } else if (idle_cpu(i) &&
+ (!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+ } else if (shallowest_idle_cpu == -1) {
+ /*
+ * If we haven't found an idle CPU yet
+ * pick a non-idle one that can fit the task as
+ * fallback.
+ */
+ shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
return target;
}
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
-{
- unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
- unsigned long capacity = capacity_orig_of(cpu);
-
- return (util >= capacity) ? capacity : util;
-}
-
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
int sync = wake_flags & WF_SYNC;
if (sd_flag & SD_BALANCE_WAKE)
- want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+ want_affine = !wake_wide(p) && task_fits_max(p, cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
rcu_read_lock();
for_each_domain(cpu, tmp) {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
+ unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
+ env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
return 1;
}
+ if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+ env->src_rq->cfs.h_nr_running == 1 &&
+ cpu_overutilized(env->src_cpu) &&
+ !cpu_overutilized(env->dst_cpu)) {
+ return 1;
+ }
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
- sd->nr_balance_failed++;
+ if (env.src_grp_nr_running > 1)
+ sd->nr_balance_failed++;
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);