#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/module.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "tune.h"
+#include "walt.h"
/*
* Targeted preemption latency for CPU-bound tasks:
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+ (10 * NSEC_PER_MSEC);
+#endif
/*
* The initial- and re-scaling of tunables is configurable
* (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+ sa->util_avg = sched_freq() ?
+ sysctl_sched_initial_task_util :
+ scale_load_down(SCHED_LOAD_SCALE);
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
-static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
-static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
+ /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+ if (cfs_rq == &rq_of(cfs_rq)->cfs)
+ trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
return decayed || removed;
}
if (entity_is_task(se))
trace_sched_load_avg_task(task_of(se), &se->avg);
- trace_sched_load_avg_cpu(cpu, cfs_rq);
}
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
#endif
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+#ifdef CONFIG_SMP
static void update_capacity_of(int cpu)
{
unsigned long req_cap;
req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
set_cfs_cpu_capacity(cpu, true, req_cap);
}
-
-static bool cpu_overutilized(int cpu);
+#endif
/*
* The enqueue_task method is called before nr_running is
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+#ifdef CONFIG_SMP
int task_new = flags & ENQUEUE_WAKEUP_NEW;
int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
for_each_sched_entity(se) {
if (se->on_rq)
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
flags = ENQUEUE_WAKEUP;
}
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running++;
+ walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(cfs_rq);
}
- if (!se) {
+ if (!se)
add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting.
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_inc_cumulative_runnable_avg(rq, p);
if (!task_new && !rq->rd->overutilized &&
- cpu_overutilized(rq->cpu))
+ cpu_overutilized(rq->cpu)) {
rq->rd->overutilized = true;
-
- schedtune_enqueue_task(p, cpu_of(rq));
+ trace_sched_overutilized(true);
+ }
/*
* We want to potentially trigger a freq switch
if (task_new || task_wakeup)
update_capacity_of(cpu_of(rq));
}
+
+#endif /* CONFIG_SMP */
hrtick_update(rq);
}
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_nr_running--;
+ walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
if (cfs_rq_throttled(cfs_rq))
break;
update_cfs_shares(cfs_rq);
}
- if (!se) {
+ if (!se)
sub_nr_running(rq, 1);
- schedtune_dequeue_task(p, cpu_of(rq));
+
+#ifdef CONFIG_SMP
+
+ /*
+ * Update SchedTune accounting
+ *
+ * We do it before updating the CPU capacity to ensure the
+ * boost value of the current task is accounted for in the
+ * selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
+ if (!se) {
+ walt_dec_cumulative_runnable_avg(rq, p);
/*
* We want to potentially trigger a freq switch
set_cfs_cpu_capacity(cpu_of(rq), false, 0);
}
}
+
+#endif /* CONFIG_SMP */
+
hrtick_update(rq);
}
return wl;
for_each_sched_entity(se) {
- long w, W;
+ struct cfs_rq *cfs_rq = se->my_q;
+ long W, w = cfs_rq_load_avg(cfs_rq);
- tg = se->my_q->tg;
+ tg = cfs_rq->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + calc_tg_weight(tg, se->my_q);
+ W = wg + atomic_long_read(&tg->load_avg);
+
+ /* Ensure \Sum rw_j >= rw_i */
+ W -= cfs_rq->tg_load_avg_contrib;
+ W += w;
/*
* w = rw_i + @wl
*/
- w = cfs_rq_load_avg(se->my_q) + wl;
+ w += wl;
/*
* wl = S * s'_i; see (2)
} while (sg = sg->next, sg != sd->groups);
}
next_cpu:
+ cpumask_clear_cpu(cpu, &visit_cpus);
continue;
}
return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
}
-#ifdef CONFIG_SCHED_TUNE
-static int energy_diff_evaluate(struct energy_env *eenv)
-{
- unsigned int boost;
- int nrg_delta;
-
- /* Return energy diff when boost margin is 0 */
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_task_boost(eenv->task);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
- if (boost == 0)
- return eenv->nrg.diff;
-
- /* Compute normalized energy diff */
- nrg_delta = schedtune_normalize_energy(eenv->nrg.diff);
- eenv->nrg.delta = nrg_delta;
-
- eenv->payoff = schedtune_accept_deltas(
- eenv->nrg.delta,
- eenv->cap.delta,
- eenv->task);
-
- /*
- * When SchedTune is enabled, the energy_diff() function will return
- * the computed energy payoff value. Since the energy_diff() return
- * value is expected to be negative by its callers, this evaluation
- * function return a negative value each time the evaluation return a
- * positive payoff, which is the condition for the acceptance of
- * a scheduling decision
- */
- return -eenv->payoff;
-}
-#else /* CONFIG_SCHED_TUNE */
-#define energy_diff_evaluate(eenv) eenv->nrg.diff
-#endif
-
/*
* energy_diff(): Estimate the energy impact of changing the utilization
* distribution. eenv specifies the change: utilisation amount, source, and
* utilization is removed from or added to the system (e.g. task wake-up). If
* both are specified, the utilization is migrated.
*/
-static int energy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
- return energy_diff_evaluate(eenv);
+ trace_sched_energy_diff(eenv->task,
+ eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+ eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+ eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+ eenv->nrg.delta, eenv->payoff);
+
+ return eenv->nrg.diff;
+}
+
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+ u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+ int max_delta;
+
+ /* Check for boundaries */
+ max_delta = schedtune_target_nrg.max_power;
+ max_delta -= schedtune_target_nrg.min_power;
+ WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+ /* Do scaling using positive numbers to increase the range */
+ normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+ /* Scale by energy magnitude */
+ normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+ /* Normalize on max energy for target platform */
+ normalized_nrg = reciprocal_divide(
+ normalized_nrg, schedtune_target_nrg.rdiv);
+
+ return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+ int boost = schedtune_task_boost(eenv->task);
+ int nrg_delta;
+
+ /* Conpute "absolute" energy diff */
+ __energy_diff(eenv);
+
+ /* Return energy diff when boost margin is 0 */
+ if (boost == 0)
+ return eenv->nrg.diff;
+
+ /* Compute normalized energy diff */
+ nrg_delta = normalize_energy(eenv->nrg.diff);
+ eenv->nrg.delta = nrg_delta;
+
+ eenv->payoff = schedtune_accept_deltas(
+ eenv->nrg.delta,
+ eenv->cap.delta,
+ eenv->task);
+
+ /*
+ * When SchedTune is enabled, the energy_diff() function will return
+ * the computed energy payoff value. Since the energy_diff() return
+ * value is expected to be negative by its callers, this evaluation
+ * function return a negative value each time the evaluation return a
+ * positive payoff, which is the condition for the acceptance of
+ * a scheduling decision
+ */
+ return -eenv->payoff;
}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
/*
* Detect M:N waker/wakee relationships via a switching-frequency heuristic.
static inline unsigned long task_util(struct task_struct *p)
{
+#ifdef CONFIG_SCHED_WALT
+ if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+ unsigned long demand = p->ravg.demand;
+ return (demand << 10) / walt_ravg_window;
+ }
+#endif
return p->se.avg.util_avg;
}
#ifdef CONFIG_SCHED_TUNE
-static unsigned long
-schedtune_margin(unsigned long signal, unsigned long boost)
+static long
+schedtune_margin(unsigned long signal, long boost)
{
- unsigned long long margin = 0;
+ long long margin = 0;
/*
* Signal proportional compensation (SPC)
*
* The Boost (B) value is used to compute a Margin (M) which is
* proportional to the complement of the original Signal (S):
- * M = B * (SCHED_LOAD_SCALE - S)
+ * M = B * (SCHED_LOAD_SCALE - S), if B is positive
+ * M = B * S, if B is negative
* The obtained M could be used by the caller to "boost" S.
*/
- margin = SCHED_LOAD_SCALE - signal;
- margin *= boost;
-
+ if (boost >= 0) {
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+ } else
+ margin = -signal * boost;
/*
* Fast integer division by constant:
* Constant : (C) = 100
margin *= 1311;
margin >>= 17;
+ if (boost < 0)
+ margin *= -1;
return margin;
}
-static inline unsigned int
+static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
{
- unsigned int boost;
+ int boost = schedtune_cpu_boost(cpu);
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_cpu_boost(cpu);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
if (boost == 0)
return 0;
return schedtune_margin(util, boost);
}
-static inline unsigned long
+static inline long
schedtune_task_margin(struct task_struct *task)
{
- unsigned int boost;
+ int boost = schedtune_task_boost(task);
unsigned long util;
- unsigned long margin;
+ long margin;
-#ifdef CONFIG_CGROUP_SCHEDTUNE
- boost = schedtune_task_boost(task);
-#else
- boost = get_sysctl_sched_cfs_boost();
-#endif
if (boost == 0)
return 0;
#else /* CONFIG_SCHED_TUNE */
-static inline unsigned int
+static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
{
return 0;
}
-static inline unsigned int
+static inline int
schedtune_task_margin(struct task_struct *task)
{
return 0;
boosted_cpu_util(int cpu)
{
unsigned long util = cpu_util(cpu);
- unsigned long margin = schedtune_cpu_margin(util, cpu);
+ long margin = schedtune_cpu_margin(util, cpu);
trace_sched_boost_cpu(cpu, util, margin);
boosted_task_util(struct task_struct *task)
{
unsigned long util = task_util(task);
- unsigned long margin = schedtune_task_margin(task);
+ long margin = schedtune_task_margin(task);
+
+ trace_sched_boost_task(task, util, margin);
return util + margin;
}
struct sched_domain *sd;
struct sched_group *sg;
int i = task_cpu(p);
+ int best_idle = -1;
+ int best_idle_cstate = -1;
+ int best_idle_capacity = INT_MAX;
- if (idle_cpu(target))
- return target;
+ if (!sysctl_sched_cstate_aware) {
+ if (idle_cpu(target))
+ return target;
- /*
- * If the prevous cpu is cache affine and idle, don't be stupid.
- */
- if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
- return i;
+ /*
+ * If the prevous cpu is cache affine and idle, don't be stupid.
+ */
+ if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+ return i;
+ }
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
tsk_cpus_allowed(p)))
goto next;
- for_each_cpu(i, sched_group_cpus(sg)) {
- if (i == target || !idle_cpu(i))
- goto next;
- }
+ if (sysctl_sched_cstate_aware) {
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+ struct rq *rq = cpu_rq(i);
+ int idle_idx = idle_get_state_idx(rq);
+ unsigned long new_usage = boosted_task_util(p);
+ unsigned long capacity_orig = capacity_orig_of(i);
+ if (new_usage > capacity_orig || !idle_cpu(i))
+ goto next;
+
+ if (i == target && new_usage <= capacity_curr_of(target))
+ return target;
+
+ if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+ best_idle = i;
+ best_idle_cstate = idle_idx;
+ best_idle_capacity = capacity_orig;
+ }
+ }
+ } else {
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ if (i == target || !idle_cpu(i))
+ goto next;
+ }
- target = cpumask_first_and(sched_group_cpus(sg),
+ target = cpumask_first_and(sched_group_cpus(sg),
tsk_cpus_allowed(p));
- goto done;
+ goto done;
+ }
next:
sg = sg->next;
} while (sg != sd->groups);
}
+ if (best_idle > 0)
+ target = best_idle;
+
done:
return target;
}
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+{
+ int iter_cpu;
+ int target_cpu = -1;
+ int target_util = 0;
+ int backup_capacity = 0;
+ int best_idle_cpu = -1;
+ int best_idle_cstate = INT_MAX;
+ int backup_cpu = -1;
+ unsigned long task_util_boosted, new_util;
+
+ task_util_boosted = boosted_task_util(p);
+ for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+ int cur_capacity;
+ struct rq *rq;
+ int idle_idx;
+
+ /*
+ * Iterate from higher cpus for boosted tasks.
+ */
+ int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+ if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+ continue;
+
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
+
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
+
+#ifdef CONFIG_SCHED_WALT
+ if (walt_cpu_high_irqload(i))
+ continue;
+#endif
+ /*
+ * Unconditionally favoring tasks that prefer idle cpus to
+ * improve latency.
+ */
+ if (idle_cpu(i) && prefer_idle) {
+ if (best_idle_cpu < 0)
+ best_idle_cpu = i;
+ continue;
+ }
+
+ cur_capacity = capacity_curr_of(i);
+ rq = cpu_rq(i);
+ idle_idx = idle_get_state_idx(rq);
+
+ if (new_util < cur_capacity) {
+ if (cpu_rq(i)->nr_running) {
+ if (prefer_idle) {
+ /* Find a target cpu with highest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util < new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ } else {
+ /* Find a target cpu with lowest
+ * utilization.
+ */
+ if (target_util == 0 ||
+ target_util > new_util) {
+ target_cpu = i;
+ target_util = new_util;
+ }
+ }
+ } else if (!prefer_idle) {
+ if (best_idle_cpu < 0 ||
+ (sysctl_sched_cstate_aware &&
+ best_idle_cstate > idle_idx)) {
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ }
+ }
+ } else if (backup_capacity == 0 ||
+ backup_capacity > cur_capacity) {
+ // Find a backup cpu with least capacity.
+ backup_capacity = cur_capacity;
+ backup_cpu = i;
+ }
+ }
+
+ if (prefer_idle && best_idle_cpu >= 0)
+ target_cpu = best_idle_cpu;
+ else if (target_cpu < 0)
+ target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+ return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
{
struct sched_domain *sd;
struct sched_group *sg, *sg_target;
int target_max_cap = INT_MAX;
int target_cpu = task_cpu(p);
+ unsigned long task_util_boosted, new_util;
int i;
+ if (sysctl_sched_sync_hint_enable && sync) {
+ int cpu = smp_processor_id();
+ cpumask_t search_cpus;
+ cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+ if (cpumask_test_cpu(cpu, &search_cpus))
+ return cpu;
+ }
+
sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
if (!sd)
sg = sd->groups;
sg_target = sg;
- /*
- * Find group with sufficient capacity. We only get here if no cpu is
- * overutilized. We may end up overutilizing a cpu by adding the task,
- * but that should not be any worse than select_idle_sibling().
- * load_balance() should sort it out later as we get above the tipping
- * point.
- */
- do {
- /* Assuming all cpus are the same in group */
- int max_cap_cpu = group_first_cpu(sg);
+ if (sysctl_sched_is_big_little) {
/*
- * Assume smaller max capacity means more energy-efficient.
- * Ideally we should query the energy model for the right
- * answer but it easily ends up in an exhaustive search.
+ * Find group with sufficient capacity. We only get here if no cpu is
+ * overutilized. We may end up overutilizing a cpu by adding the task,
+ * but that should not be any worse than select_idle_sibling().
+ * load_balance() should sort it out later as we get above the tipping
+ * point.
*/
- if (capacity_of(max_cap_cpu) < target_max_cap &&
- task_fits_max(p, max_cap_cpu)) {
- sg_target = sg;
- target_max_cap = capacity_of(max_cap_cpu);
- }
- } while (sg = sg->next, sg != sd->groups);
+ do {
+ /* Assuming all cpus are the same in group */
+ int max_cap_cpu = group_first_cpu(sg);
- /* Find cpu with sufficient capacity */
- for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
- /*
- * p's blocked utilization is still accounted for on prev_cpu
- * so prev_cpu will receive a negative bias due to the double
- * accounting. However, the blocked utilization may be zero.
- */
- int new_util = cpu_util(i) + boosted_task_util(p);
+ /*
+ * Assume smaller max capacity means more energy-efficient.
+ * Ideally we should query the energy model for the right
+ * answer but it easily ends up in an exhaustive search.
+ */
+ if (capacity_of(max_cap_cpu) < target_max_cap &&
+ task_fits_max(p, max_cap_cpu)) {
+ sg_target = sg;
+ target_max_cap = capacity_of(max_cap_cpu);
+ }
+ } while (sg = sg->next, sg != sd->groups);
- if (new_util > capacity_orig_of(i))
- continue;
+ task_util_boosted = boosted_task_util(p);
+ /* Find cpu with sufficient capacity */
+ for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+ /*
+ * p's blocked utilization is still accounted for on prev_cpu
+ * so prev_cpu will receive a negative bias due to the double
+ * accounting. However, the blocked utilization may be zero.
+ */
+ new_util = cpu_util(i) + task_util_boosted;
- if (new_util < capacity_curr_of(i)) {
- target_cpu = i;
- if (cpu_rq(i)->nr_running)
- break;
- }
+ /*
+ * Ensure minimum capacity to grant the required boost.
+ * The target CPU can be already at a capacity level higher
+ * than the one required to boost the task.
+ */
+ if (new_util > capacity_orig_of(i))
+ continue;
- /* cpu has capacity at higher OPP, keep it as fallback */
- if (target_cpu == task_cpu(p))
- target_cpu = i;
+ if (new_util < capacity_curr_of(i)) {
+ target_cpu = i;
+ if (cpu_rq(i)->nr_running)
+ break;
+ }
+
+ /* cpu has capacity at higher OPP, keep it as fallback */
+ if (target_cpu == task_cpu(p))
+ target_cpu = i;
+ }
+ } else {
+ /*
+ * Find a cpu with sufficient capacity
+ */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ bool boosted = schedtune_task_boost(p) > 0;
+ bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+ bool boosted = 0;
+ bool prefer_idle = 0;
+#endif
+ int tmp_target = find_best_target(p, boosted, prefer_idle);
+ if (tmp_target >= 0) {
+ target_cpu = tmp_target;
+ if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+ return target_cpu;
+ }
}
if (target_cpu != task_cpu(p)) {
if (!sd) {
if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
- new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+ new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
new_cpu = select_idle_sibling(p, new_cpu);
{
remove_entity_load_avg(&p->se);
}
+#else
+#define task_fits_max(p, cpu) true
#endif /* CONFIG_SMP */
static unsigned long
deactivate_task(env->src_rq, p, 0);
p->on_rq = TASK_ON_RQ_MIGRATING;
+ double_lock_balance(env->src_rq, env->dst_rq);
set_task_cpu(p, env->dst_cpu);
+ double_unlock_balance(env->src_rq, env->dst_rq);
}
/*
mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
raw_spin_unlock_irqrestore(&mcc->lock, flags);
- //pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+/*
+ printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+ cpu, capacity);
+*/
goto skip_unlock;
#endif
}
bool *overload, bool *overutilized)
{
unsigned long load;
- int i;
+ int i, nr_running;
memset(sgs, 0, sizeof(*sgs));
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
- if (rq->nr_running > 1)
+ nr_running = rq->nr_running;
+ if (nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
- if (idle_cpu(i))
+ /*
+ * No need to call idle_cpu() if nr_running is not 0
+ */
+ if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
if (cpu_overutilized(i)) {
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
- if (env->dst_rq->rd->overutilized != overutilized)
+ if (env->dst_rq->rd->overutilized != overutilized) {
env->dst_rq->rd->overutilized = overutilized;
+ trace_sched_overutilized(overutilized);
+ }
} else {
- if (!env->dst_rq->rd->overutilized && overutilized)
+ if (!env->dst_rq->rd->overutilized && overutilized) {
env->dst_rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
}
+
}
/**
struct sched_domain *sd;
int pulled_task = 0;
u64 curr_cost = 0;
+ long removed_util=0;
idle_enter_fair(this_rq);
raw_spin_unlock(&this_rq->lock);
+ /*
+ * If removed_util_avg is !0 we most probably migrated some task away
+ * from this_cpu. In this case we might be willing to trigger an OPP
+ * update, but we want to do so if we don't find anybody else to pull
+ * here (we will trigger an OPP update with the pulled task's enqueue
+ * anyway).
+ *
+ * Record removed_util before calling update_blocked_averages, and use
+ * it below (before returning) to see if an OPP update is required.
+ */
+ removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
update_blocked_averages(this_cpu);
rcu_read_lock();
for_each_domain(this_cpu, sd) {
if (pulled_task) {
idle_exit_fair(this_rq);
this_rq->idle_stamp = 0;
+ } else if (removed_util) {
+ /*
+ * No task pulled and someone has been migrated away.
+ * Good case to trigger an OPP update.
+ */
+ update_capacity_of(this_cpu);
}
return pulled_task;
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+#ifdef CONFIG_SMP
+ if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
rq->rd->overutilized = true;
+ trace_sched_overutilized(true);
+ }
rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
}
/*