*/
unsigned long nr_uninterruptible;
- struct task_struct *curr, *idle;
+ struct task_struct *curr, *idle, *stop;
unsigned long next_balance;
struct mm_struct *prev_mm;
u64 clock;
+ u64 clock_task;
atomic_t nr_iowait;
u64 avg_idle;
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
#endif /* CONFIG_CGROUP_SCHED */
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
+/*
+ * Refresh rq->clock from sched_clock_cpu() and derive rq->clock_task from
+ * it, unless the runqueue has asked for the update to be skipped.
+ */
inline void update_rq_clock(struct rq *rq)
{
- if (!rq->skip_clock_update)
- rq->clock = sched_clock_cpu(cpu_of(rq));
+	u64 irq_total, task_clock;
+	int cpu;
+
+	if (rq->skip_clock_update)
+		return;
+
+	cpu = cpu_of(rq);
+	rq->clock = sched_clock_cpu(cpu);
+	irq_total = irq_time_cpu(cpu);
+
+	/*
+	 * clock_task is rq->clock with the accumulated irq time taken out.
+	 * It is only ever moved forward; a candidate value that is not
+	 * larger than the current one is ignored.
+	 */
+	task_clock = rq->clock - irq_total;
+	if (task_clock > rq->clock_task)
+		rq->clock_task = task_clock;
+
+	sched_irq_time_avg_update(rq, irq_total);
}
/*
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
+
+/*
+ * No-op variant of sched_avg_update().
+ * NOTE(review): presumably the !CONFIG_SMP fallback, given the #endif that
+ * follows -- confirm against the enclosing conditional.
+ */
+static void sched_avg_update(struct rq *rq) { }
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
static const struct sched_class rt_sched_class;
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
static void set_load_weight(struct task_struct *p)
{
- if (task_has_rt_policy(p)) {
- p->se.load.weight = 0;
- p->se.load.inv_weight = WMULT_CONST;
- return;
- }
-
/*
* SCHED_IDLE tasks get minimal weight:
*/
dec_nr_running(rq);
}
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime(), on the corresponding
+ * CPU with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in another CPU reading this CPU's irq time, which can
+ * race with irq/account_system_vtime() on this CPU. The reader would get
+ * either the old or the new value (or a semi-updated value on 32 bit),
+ * with the side effect of accounting a slice of irq time to the wrong task
+ * when an irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in
+ * account_system_vtime().
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+/* Switch irq time accounting on by setting sched_clock_irqtime. */
+void enable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 1;
+}
+
+/* Switch irq time accounting off by clearing sched_clock_irqtime. */
+void disable_sched_clock_irqtime(void)
+{
+	sched_clock_irqtime = 0;
+}
+
+/*
+ * Return the softirq plus hardirq time recorded for @cpu, or 0 while
+ * sched_clock_irqtime is clear.
+ */
+static u64 irq_time_cpu(int cpu)
+{
+	u64 total = 0;
+
+	if (sched_clock_irqtime) {
+		total = per_cpu(cpu_softirq_time, cpu);
+		total += per_cpu(cpu_hardirq_time, cpu);
+	}
+
+	return total;
+}
+
+/*
+ * Charge the time elapsed since the previous call on this CPU to the
+ * per-cpu hardirq or softirq counter, depending on the context we are
+ * running in. Does nothing while sched_clock_irqtime is clear.
+ */
+void account_system_vtime(struct task_struct *curr)
+{
+	unsigned long flags;
+	u64 *bucket = NULL;
+	u64 now, elapsed;
+	int cpu;
+
+	if (!sched_clock_irqtime)
+		return;
+
+	local_irq_save(flags);
+
+	cpu = smp_processor_id();
+	now = sched_clock_cpu(cpu);
+	elapsed = now - per_cpu(irq_start_time, cpu);
+	per_cpu(irq_start_time, cpu) = now;
+
+	/*
+	 * Softirq time from ksoftirqd is not accounted here. It keeps being
+	 * accounted to the ksoftirqd thread, so as not to confuse the
+	 * scheduler with a special task that does not consume any time but
+	 * still wants to run.
+	 */
+	if (hardirq_count())
+		bucket = &per_cpu(cpu_hardirq_time, cpu);
+	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+		bucket = &per_cpu(cpu_softirq_time, cpu);
+
+	/* In any other context the elapsed time is dropped. */
+	if (bucket)
+		*bucket += elapsed;
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+/*
+ * Feed the irq time accumulated since the previous call into
+ * sched_rt_avg_update(). Only active while irq time accounting is on and
+ * the NONIRQ_POWER scheduler feature is set.
+ */
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+	u64 irq_delta;
+
+	if (!sched_clock_irqtime || !sched_feat(NONIRQ_POWER))
+		return;
+
+	irq_delta = curr_irq_time - rq->prev_irq_time;
+	rq->prev_irq_time = curr_irq_time;
+	sched_rt_avg_update(rq, irq_delta);
+}
+
+#else
+
+/* Without CONFIG_IRQ_TIME_ACCOUNTING no irq time is tracked: always 0. */
+static u64 irq_time_cpu(int cpu)
+{
+	return 0;
+}
+
+/* Without CONFIG_IRQ_TIME_ACCOUNTING there is nothing to average: no-op. */
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
#include "sched_idletask.c"
#include "sched_fair.c"
#include "sched_rt.c"
+#include "sched_stoptask.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
+/*
+ * Install @stop as the stop task of @cpu's runqueue.
+ *
+ * A non-NULL @stop is given SCHED_FIFO at priority MAX_RT_PRIO - 1 and is
+ * then moved to stop_sched_class. A NULL @stop just clears the slot. The
+ * task that previously occupied the slot, if any, is handed back to
+ * rt_sched_class.
+ */
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, it's something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+		stop->sched_class = &stop_sched_class;
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling class so that
+		 * it can die in pieces.
+		 */
+		old_stop->sched_class = &rt_sched_class;
+	}
+}
+
/*
* __normal_prio - return the priority that is based on the static prio
*/
if (p->sched_class != &fair_sched_class)
return 0;
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
*/
arch_start_context_switch(prev);
- if (likely(!mm)) {
+ if (!mm) {
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
switch_mm(oldmm, mm, next);
- if (likely(!prev->mm)) {
+ if (!prev->mm) {
prev->active_mm = NULL;
rq->prev_mm = oldmm;
}
this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
}
+
+ sched_avg_update(this_rq);
}
static void update_cpu_load_active(struct rq *this_rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock - p->se.exec_start;
+ ns = rq->clock_task - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (in_serving_softirq())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
if (total) {
- u64 temp;
+ u64 temp = rtime;
- temp = (u64)(rtime * utime);
+ temp *= utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
if (total) {
- u64 temp;
+ u64 temp = rtime;
- temp = (u64)(rtime * cputime.utime);
+ temp *= cputime.utime;
do_div(temp, total);
utime = (cputime_t)temp;
} else
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
- perf_event_task_tick(curr);
+ perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
return p;
}
- class = sched_class_highest;
- for ( ; ; ) {
+ for_each_class(class) {
p = class->pick_next_task(rq);
if (p)
return p;
- /*
- * Will never be NULL as the idle class always
- * returns a non-NULL p:
- */
- class = class->next;
}
+
+ BUG(); /* the idle class will always have a runnable task */
}
/*
rq = task_rq_lock(p, &flags);
+ trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
on_rq = p->se.on_rq;
}
if (user) {
- retval = security_task_setscheduler(p, policy, param);
+ retval = security_task_setscheduler(p);
if (retval)
return retval;
}
*/
rq = __task_rq_lock(p);
+ /*
+ * Changing the policy of the stop threads is a very bad idea
+ */
+ if (p == rq->stop) {
+ __task_rq_unlock(rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ return -EINVAL;
+ }
+
#ifdef CONFIG_RT_GROUP_SCHED
if (user) {
/*
if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
goto out_unlock;
- retval = security_task_setscheduler(p, 0, NULL);
+ retval = security_task_setscheduler(p);
if (retval)
goto out_unlock;
cpuset_cpus_allowed(p, cpus_allowed);
cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
retval = set_cpus_allowed_ptr(p, new_mask);
if (!retval) {
idle->se.exec_start = sched_clock();
cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ /*
+ * We're having a chicken and egg problem, even though we are
+ * holding rq->lock, the cpu isn't yet set to this cpu so the
+ * lockdep check in task_group() will fail.
+ *
+ * Similar case to sched_fork(). / Alternatively we could
+ * use task_rq_lock() here and obtain the other rq->lock.
+ *
+ * Silence PROVE_RCU
+ */
+ rcu_read_lock();
__set_task_cpu(idle, cpu);
+ rcu_read_unlock();
rq->curr = rq->idle = idle;
#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
return 1;
- err_free_rq:
+err_free_rq:
kfree(cfs_rq);
- err:
+err:
return 0;
}
return 1;
- err_free_rq:
+err_free_rq:
kfree(rt_rq);
- err:
+err:
return 0;
}
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);