*/
#include <linux/cpufreq.h>
#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#ifdef CONFIG_SCHED_HMP
+#include <linux/cpuidle.h>
+#endif
#include "sched.h"
if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
continue;
+ /*
+ * Skip inaccessible VMAs to avoid any confusion between
+ * PROT_NONE and NUMA hinting ptes
+ */
+ if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+ continue;
+
do {
start = max(start, vma->vm_start);
end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
*/
update_entity_load_avg(curr, 1);
update_cfs_rq_blocked_load(cfs_rq, 1);
+ update_cfs_shares(cfs_rq);
#ifdef CONFIG_SCHED_HRTICK
/*
return static_key_false(&__cfs_bandwidth_used);
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void) /* bumps the static key tested by cfs_bandwidth_used() */
{
- /* only need to count groups transitioning between enabled/!enabled */
- if (enabled && !was_enabled)
- static_key_slow_inc(&__cfs_bandwidth_used);
- else if (!enabled && was_enabled)
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_inc(&__cfs_bandwidth_used); /* unconditional; NOTE(review): the enabled/!enabled transition check of the removed helper is presumably now done by callers - confirm */
+}
+
+void cfs_bandwidth_usage_dec(void) /* drops the static key tested by cfs_bandwidth_used() */
+{
+ static_key_slow_dec(&__cfs_bandwidth_used); /* unconditional; presumably must pair with an earlier cfs_bandwidth_usage_inc() - verify against callers */
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
return true;
}
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {} /* no-op without HAVE_JUMP_LABEL: cfs_bandwidth_used() returns true unconditionally here */
+void cfs_bandwidth_usage_dec(void) {} /* no-op without HAVE_JUMP_LABEL: there is no static key to drop */
#endif /* HAVE_JUMP_LABEL */
/*
cfs_rq->throttled_clock = rq->clock;
raw_spin_lock(&cfs_b->lock);
list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (!cfs_b->timer_active)
+ __start_cfs_bandwidth(cfs_b);
raw_spin_unlock(&cfs_b->lock);
}
if (idle)
goto out_unlock;
+ /*
+ * if we have relooped after returning idle once, we need to update our
+ * status as actually running, so that other cpus doing
+ * __start_cfs_bandwidth will stop trying to cancel us.
+ */
+ cfs_b->timer_active = 1;
+
__refill_cfs_bandwidth_runtime(cfs_b);
if (!throttled) {
/* how long we wait to gather additional slack before distributing */
static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
u64 expires;
/* confirm we're still not at a refresh boundary */
- if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+ raw_spin_lock(&cfs_b->lock);
+ if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+ raw_spin_unlock(&cfs_b->lock);
return;
+ }
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
runtime = cfs_b->runtime;
cfs_b->runtime = 0;
* (timer_active==0 becomes visible before the hrtimer call-back
* terminates). In either case we ensure that it's re-programmed
*/
- while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+ while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+ hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+ /* bounce the lock to allow do_sched_cfs_period_timer to run */
raw_spin_unlock(&cfs_b->lock);
- /* ensure cfs_b->lock is available while we wait */
- hrtimer_cancel(&cfs_b->period_timer);
-
+ cpu_relax();
raw_spin_lock(&cfs_b->lock);
/* if someone else restarted the timer then we're done */
if (cfs_b->timer_active)
extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+#ifdef CONFIG_CPU_IDLE
+/*
+ * hmp_idle_pull:
+ *
+ * In this version we have stopped using forced up migrations when we
+ * detect that a task running on a little CPU should be moved to a bigger
+ * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced
+ * migration means we stop the task immediately but need to wait for the
+ * target CPU to wake up before we can restart the task which is being
+ * moved. Instead, we now wake a big CPU with an IPI and ask it to pull
+ * a task when ready. This allows the task to continue executing on its
+ * current CPU, reducing the amount of time that the task is stalled for.
+ *
+ * keepalive timers:
+ *
+ * The keepalive timer is used as a way to keep a CPU engaged in an
+ * idle pull operation out of idle while waiting for the source
+ * CPU to stop and move the task. Ideally this would not be necessary
+ * and we could impose a temporary zero-latency requirement on the
+ * current CPU, but in the current QoS framework this will result in
+ * all CPUs in the system being unable to enter idle states which is
+ * not desirable. The timer does not perform any work when it expires.
+ */
+struct hmp_keepalive { /* state of one CPU's idle-pull keepalive timer */
+ bool init; /* set by hmp_cpu_keepalive_trigger() once timer and delay are set up */
+ ktime_t delay; /* if zero, no need for timer */
+ struct hrtimer timer; /* armed by hmp_cpu_keepalive_trigger(), cancelled by hmp_cpu_keepalive_cancel() */
+};
+DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive); /* one keepalive instance per CPU */
+
+/*
+ * Expiry callback installed on the per-cpu keepalive timers.
+ *
+ * It performs no work and returns HRTIMER_NORESTART so the timer is not
+ * re-armed; the timer is started again only by the next call to
+ * hmp_cpu_keepalive_trigger().
+ */
+static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
+{
+ return HRTIMER_NORESTART;
+}
+
+/*
+ * hmp_keepalive_delay - work out the keepalive timer period for a CPU.
+ * @cpu:      CPU whose cpuidle driver states are inspected.
+ * @ns_delay: in:  the largest idle exit latency, in ns, we are willing to
+ *                 tolerate.
+ *            out: the timer period to use, in ns, or zero when no timer
+ *                 is required.
+ *
+ * Work out if any of the idle states have an exit latency too high for us.
+ * If there are none, set *ns_delay to zero.
+ * If there are any, take the smallest target_residency among those states
+ * and set *ns_delay to (target_residency - 1) * 1000, i.e. just short of
+ * the residency of the first state we want to stay out of.
+ *
+ * Edge cases:
+ *  - a too-slow state with a target_residency of 0 cannot be avoided by
+ *    any timer, so *ns_delay is set to zero rather than letting
+ *    (0 - 1) * 1000 wrap around to a multi-second period;
+ *  - a residency too large to express in ns in an unsigned int saturates
+ *    at UINT_MAX instead of wrapping;
+ *  - if the CPU has no cpuidle driver, *ns_delay is left untouched.
+ */
+static void hmp_keepalive_delay(int cpu, unsigned int *ns_delay)
+{
+	struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
+	struct cpuidle_driver *drv;
+
+	drv = cpuidle_get_cpu_driver(dev);
+	if (drv) {
+		unsigned int us_delay = UINT_MAX;
+		unsigned int us_max_delay = *ns_delay / 1000;
+		int idx;
+		/* if cpuidle states are guaranteed to be sorted we
+		 * could stop at the first match.
+		 */
+		for (idx = 0; idx < drv->state_count; idx++) {
+			if (drv->states[idx].exit_latency > us_max_delay &&
+				drv->states[idx].target_residency < us_delay) {
+				us_delay = drv->states[idx].target_residency;
+			}
+		}
+		if (us_delay == UINT_MAX || us_delay == 0)
+			*ns_delay = 0; /* no timer required, or none can help */
+		else if (us_delay - 1 > UINT_MAX / 1000)
+			*ns_delay = UINT_MAX; /* saturate, do not wrap */
+		else
+			*ns_delay = 1000 * (us_delay - 1);
+	}
+}
+
+/*
+ * Arm the keepalive timer of the CPU this is called on.
+ *
+ * The first call on a given CPU initialises that CPU's hrtimer and
+ * computes the timer period via hmp_keepalive_delay(), starting from a
+ * tolerated exit latency of 100usec. Every call then starts the timer,
+ * unless the computed period is zero, in which case nothing is armed.
+ */
+static void hmp_cpu_keepalive_trigger(void)
+{
+	int this_cpu = smp_processor_id();
+	struct hmp_keepalive *ka = &per_cpu(hmp_cpu_keepalive, this_cpu);
+
+	/* one-time, lazy setup of this CPU's timer and period */
+	if (!ka->init) {
+		unsigned int tolerated_ns = 100000; /* tolerate 100usec delay */
+
+		hrtimer_init(&ka->timer, CLOCK_MONOTONIC,
+			     HRTIMER_MODE_REL_PINNED);
+		ka->timer.function = hmp_cpu_keepalive_notify;
+
+		hmp_keepalive_delay(this_cpu, &tolerated_ns);
+		ka->delay = ns_to_ktime(tolerated_ns);
+		ka->init = true;
+	}
+
+	/* a zero period means no timer is needed */
+	if (!ktime_to_ns(ka->delay))
+		return;
+
+	hrtimer_start(&ka->timer, ka->delay, HRTIMER_MODE_REL_PINNED);
+}
+
+/*
+ * Cancel the keepalive timer belonging to @cpu.
+ *
+ * Does nothing if that CPU's timer was never initialised, i.e.
+ * hmp_cpu_keepalive_trigger() has not yet run there.
+ */
+static void hmp_cpu_keepalive_cancel(int cpu)
+{
+	struct hmp_keepalive *ka = &per_cpu(hmp_cpu_keepalive, cpu);
+
+	if (!ka->init)
+		return;
+
+	hrtimer_cancel(&ka->timer);
+}
+#else /* !CONFIG_CPU_IDLE */
+static void hmp_cpu_keepalive_trigger(void)
+{ /* stub for !CONFIG_CPU_IDLE builds: no keepalive timer is armed */
+}
+
+static void hmp_cpu_keepalive_cancel(int cpu)
+{ /* stub for !CONFIG_CPU_IDLE builds: there is no timer to cancel */
+}
+#endif
+
/* Setup hmp_domains */
static int __init hmp_cpu_mask_setup(void)
{
if(domain)
cpumask_clear_cpu(cpu, &domain->cpus);
+
+ hmp_cpu_keepalive_cancel(cpu);
}
/*
* Needed to determine heaviest tasks etc.
/* must hold runqueue lock for queue se is currently on */
static struct sched_entity *hmp_get_heaviest_task(
- struct sched_entity *se, int migrate_up)
+ struct sched_entity *se, int target_cpu)
{
int num_tasks = hmp_max_tasks;
struct sched_entity *max_se = se;
unsigned long int max_ratio = se->avg.load_avg_ratio;
const struct cpumask *hmp_target_mask = NULL;
+ struct hmp_domain *hmp;
- if (migrate_up) {
- struct hmp_domain *hmp;
- if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
- return max_se;
+ if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+ return max_se;
- hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
- hmp_target_mask = &hmp->cpus;
+ hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+ hmp_target_mask = &hmp->cpus;
+ if (target_cpu >= 0) {
+ /* idle_balance gets run on a CPU while
+ * it is in the middle of being hotplugged
+ * out. Bail early in that case.
+ */
+ if(!cpumask_test_cpu(target_cpu, hmp_target_mask))
+ return NULL;
+ hmp_target_mask = cpumask_of(target_cpu);
}
/* The currently running task is not on the runqueue */
se = __pick_first_entity(cfs_rq_of(se));
while (num_tasks && se) {
if (entity_is_task(se) &&
- (se->avg.load_avg_ratio > max_ratio &&
- hmp_target_mask &&
- cpumask_intersects(hmp_target_mask,
- tsk_cpus_allowed(task_of(se))))) {
+ se->avg.load_avg_ratio > max_ratio &&
+ cpumask_intersects(hmp_target_mask,
+ tsk_cpus_allowed(task_of(se)))) {
max_se = se;
max_ratio = se->avg.load_avg_ratio;
}
#else
new_cpu = hmp_select_slower_cpu(p, prev_cpu);
#endif
- if (new_cpu != prev_cpu) {
+ /*
+ * we might have no suitable CPU
+ * in which case new_cpu == NR_CPUS
+ */
+ if (new_cpu < NR_CPUS && new_cpu != prev_cpu) {
hmp_next_down_delay(&p->se, new_cpu);
trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
return new_cpu;
target = cpu_rq(cpu);
raw_spin_lock_irqsave(&target->lock, flags);
curr = target->cfs.curr;
- if (!curr) {
+ if (!curr || target->active_balance) {
raw_spin_unlock_irqrestore(&target->lock, flags);
continue;
}
}
}
orig = curr;
- curr = hmp_get_heaviest_task(curr, 1);
+ curr = hmp_get_heaviest_task(curr, -1);
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
p = task_of(curr);
if (hmp_up_migration(cpu, &target_cpu, curr)) {
- if (!target->active_balance) {
- get_task_struct(p);
- target->push_cpu = target_cpu;
- target->migrate_task = p;
- got_target = 1;
- trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
- hmp_next_up_delay(&p->se, target->push_cpu);
- }
+ cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ spin_unlock(&hmp_force_migration);
+ smp_send_reschedule(target_cpu);
+ return;
}
- if (!got_target && !target->active_balance) {
+ if (!got_target) {
/*
* For now we just check the currently running task.
* Selecting the lightest task for offloading will
* is not currently running move it, otherwise let the
* CPU stopper take care of it.
*/
- if (got_target && !target->active_balance) {
+ if (got_target) {
if (!task_running(target, p)) {
trace_sched_hmp_migrate_force_running(p, 0);
hmp_migrate_runnable_task(target);
}
}
orig = curr;
- curr = hmp_get_heaviest_task(curr, 1);
+ curr = hmp_get_heaviest_task(curr, this_cpu);
/* check if heaviest eligible task on this
* CPU is heavier than previous task
*/
- if (hmp_task_eligible_for_up_migration(curr) &&
- curr->avg.load_avg_ratio > ratio) {
+ if (curr && hmp_task_eligible_for_up_migration(curr) &&
+ curr->avg.load_avg_ratio > ratio &&
+ cpumask_test_cpu(this_cpu,
+ tsk_cpus_allowed(task_of(curr)))) {
p = task_of(curr);
target = rq;
ratio = curr->avg.load_avg_ratio;
raw_spin_unlock_irqrestore(&target->lock, flags);
if (force) {
+ /* start timer to keep us awake */
+ hmp_cpu_keepalive_trigger();
stop_one_cpu_nowait(cpu_of(target),
hmp_active_task_migration_cpu_stop,
target, &target->active_balance_work);
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
+#ifdef CONFIG_SCHED_HMP
+ /* shortcut for hmp idle pull wakeups */
+ if (unlikely(this_rq->wake_for_idle_pull)) {
+ this_rq->wake_for_idle_pull = 0;
+ if (hmp_idle_pull(this_cpu)) {
+ /* break out unless running nohz idle as well */
+ if (idle != CPU_IDLE)
+ return;
+ }
+ }
+#endif
+
hmp_force_up_migration(this_cpu);
rebalance_domains(this_cpu, idle);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
- if (unlikely(task_cpu(p) != this_cpu)) {
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
- }
+ /*
+ * Not only the cpu but also the task_group of the parent might have
+ * been changed after parent->se.parent,cfs_rq were copied to
+ * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
+ * of child point to valid ones.
+ */
+ rcu_read_lock();
+ __set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
update_curr(cfs_rq);
struct cfs_rq *cfs_rq = cfs_rq_of(se);
/*
- * Ensure the task's vruntime is normalized, so that when its
+ * Ensure the task's vruntime is normalized, so that when it's
* switched back to the fair class the enqueue_entity(.flags=0) will
* do the right thing.
*
- * If it was on_rq, then the dequeue_entity(.flags=0) will already
- * have normalized the vruntime, if it was !on_rq, then only when
+ * If it's on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it's !on_rq, then only when
* the task is sleeping will it still have non-normalized vruntime.
*/
- if (!se->on_rq && p->state != TASK_RUNNING) {
+ if (!p->on_rq && p->state != TASK_RUNNING) {
/*
* Fix up our vruntime so that the current sleep doesn't
* cause 'unlimited' sleep bonus.
se->cfs_rq = parent->my_q;
se->my_q = cfs_rq;
- update_load_set(&se->load, 0);
+ /* guarantee group entities always have weight */
+ update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
}