Merge branch 'x86-bios-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...

[firefly-linux-kernel-4.4.55.git] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index b40b82e33590d1cbf0f2ce30d602e255b519bc96..d42992bccdfae88569559f3e88f81bbcea8e9494 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -486,11 +486,12 @@ struct rq {
          */
         unsigned long nr_uninterruptible;
  
-       struct task_struct *curr, *idle;
+       struct task_struct *curr, *idle, *stop;
         unsigned long next_balance;
         struct mm_struct *prev_mm;
  
         u64 clock;
+       u64 clock_task;
  
         atomic_t nr_iowait;
  
@@ -518,6 +519,10 @@ struct rq {
         u64 avg_idle;
  #endif
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+       u64 prev_irq_time;
+#endif
+
         /* calc_load related fields */
         unsigned long calc_load_update;
         long calc_load_active;
@@ -641,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
  
  #endif /* CONFIG_CGROUP_SCHED */
  
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
  inline void update_rq_clock(struct rq *rq)
  {
-       if (!rq->skip_clock_update)
-               rq->clock = sched_clock_cpu(cpu_of(rq));
+       if (!rq->skip_clock_update) {
+               int cpu = cpu_of(rq);
+               u64 irq_time;
+
+               rq->clock = sched_clock_cpu(cpu);
+               irq_time = irq_time_cpu(cpu);
+               if (rq->clock - irq_time > rq->clock_task)
+                       rq->clock_task = rq->clock - irq_time;
+
+               sched_irq_time_avg_update(rq, irq_time);
+       }
  }
  
  /*
@@ -1291,6 +1308,10 @@ static void resched_task(struct task_struct *p)
  static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
  {
  }
+
+static void sched_avg_update(struct rq *rq)
+{
+}
  #endif /* CONFIG_SMP */
  
  #if BITS_PER_LONG == 32
@@ -1833,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
  
  static const struct sched_class rt_sched_class;
  
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
  #define for_each_class(class) \
     for (class = sched_class_highest; class; class = class->next)
  
@@ -1851,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
  
  static void set_load_weight(struct task_struct *p)
  {
-       if (task_has_rt_policy(p)) {
-               p->se.load.weight = 0;
-               p->se.load.inv_weight = WMULT_CONST;
-               return;
-       }
-
         /*
          * SCHED_IDLE tasks get minimal weight:
          */
@@ -1910,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
         dec_nr_running(rq);
  }
  
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_vtime.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+       sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+       if (!sched_clock_irqtime)
+               return 0;
+
+       return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+       unsigned long flags;
+       int cpu;
+       u64 now, delta;
+
+       if (!sched_clock_irqtime)
+               return;
+
+       local_irq_save(flags);
+
+       cpu = smp_processor_id();
+       now = sched_clock_cpu(cpu);
+       delta = now - per_cpu(irq_start_time, cpu);
+       per_cpu(irq_start_time, cpu) = now;
+       /*
+        * We do not account for softirq time from ksoftirqd here.
+        * We want to continue accounting softirq time to ksoftirqd thread
+        * in that case, so as not to confuse scheduler with a special task
+        * that does not consume any time, but still wants to run.
+        */
+       if (hardirq_count())
+               per_cpu(cpu_hardirq_time, cpu) += delta;
+       else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+               per_cpu(cpu_softirq_time, cpu) += delta;
+
+       local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+       if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+               u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+               rq->prev_irq_time = curr_irq_time;
+               sched_rt_avg_update(rq, delta_irq);
+       }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+       return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
  #include "sched_idletask.c"
  #include "sched_fair.c"
  #include "sched_rt.c"
+#include "sched_stoptask.c"
  #ifdef CONFIG_SCHED_DEBUG
  # include "sched_debug.c"
  #endif
  
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+       struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+       struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+       if (stop) {
+               /*
+                * Make it appear like a SCHED_FIFO task, it's something
+                * userspace knows about and won't get confused about.
+                *
+                * Also, it will make PI more or less work without too
+                * much confusion -- but then, stop work should not
+                * rely on PI working anyway.
+                */
+               sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+               stop->sched_class = &stop_sched_class;
+       }
+
+       cpu_rq(cpu)->stop = stop;
+
+       if (old_stop) {
+               /*
+                * Reset it back to a normal scheduling class so that
+                * it can die in pieces.
+                */
+               old_stop->sched_class = &rt_sched_class;
+       }
+}
+
  /*
   * __normal_prio - return the priority that is based on the static prio
   */
@@ -1996,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
         if (p->sched_class != &fair_sched_class)
                 return 0;
  
+       if (unlikely(p->policy == SCHED_IDLE))
+               return 0;
+
         /*
          * Buddy candidates are cache hot:
          */
@@ -2845,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
          */
         arch_start_context_switch(prev);
  
-       if (likely(!mm)) {
+       if (!mm) {
                 next->active_mm = oldmm;
                 atomic_inc(&oldmm->mm_count);
                 enter_lazy_tlb(oldmm, next);
         } else
                 switch_mm(oldmm, mm, next);
  
-       if (likely(!prev->mm)) {
+       if (!prev->mm) {
                 prev->active_mm = NULL;
                 rq->prev_mm = oldmm;
         }
@@ -3179,6 +3316,8 @@ static void update_cpu_load(struct rq *this_rq)
  
                 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
         }
+
+       sched_avg_update(this_rq);
  }
  
  static void update_cpu_load_active(struct rq *this_rq)
@@ -3239,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
  
         if (task_current(rq, p)) {
                 update_rq_clock(rq);
-               ns = rq->clock - p->se.exec_start;
+               ns = rq->clock_task - p->se.exec_start;
                 if ((s64)ns < 0)
                         ns = 0;
         }
@@ -3388,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
         tmp = cputime_to_cputime64(cputime);
         if (hardirq_count() - hardirq_offset)
                 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-       else if (softirq_count())
+       else if (in_serving_softirq())
                 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3504,9 +3643,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
  
         if (total) {
-               u64 temp;
+               u64 temp = rtime;
  
-               temp = (u64)(rtime * utime);
+               temp *= utime;
                 do_div(temp, total);
                 utime = (cputime_t)temp;
         } else
@@ -3537,9 +3676,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
         rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
  
         if (total) {
-               u64 temp;
+               u64 temp = rtime;
  
-               temp = (u64)(rtime * cputime.utime);
+               temp *= cputime.utime;
                 do_div(temp, total);
                 utime = (cputime_t)temp;
         } else
@@ -3575,7 +3714,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
-       perf_event_task_tick(curr);
+       perf_event_task_tick();
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@ -3714,17 +3853,13 @@ pick_next_task(struct rq *rq)
                         return p;
         }
  
-       class = sched_class_highest;
-       for ( ; ; ) {
+       for_each_class(class) {
                 p = class->pick_next_task(rq);
                 if (p)
                         return p;
-               /*
-                * Will never be NULL as the idle class always
-                * returns a non-NULL p:
-                */
-               class = class->next;
         }
+
+       BUG(); /* the idle class will always have a runnable task */
  }
  
  /*
@@ -4349,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
  
         rq = task_rq_lock(p, &flags);
  
+       trace_sched_pi_setprio(p, prio);
         oldprio = p->prio;
         prev_class = p->sched_class;
         on_rq = p->se.on_rq;
@@ -4636,7 +4772,7 @@ recheck:
         }
  
         if (user) {
-               retval = security_task_setscheduler(p, policy, param);
+               retval = security_task_setscheduler(p);
                 if (retval)
                         return retval;
         }
@@ -4652,6 +4788,15 @@ recheck:
          */
         rq = __task_rq_lock(p);
  
+       /*
+        * Changing the policy of the stop threads is a very bad idea
+        */
+       if (p == rq->stop) {
+               __task_rq_unlock(rq);
+               raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+               return -EINVAL;
+       }
+
  #ifdef CONFIG_RT_GROUP_SCHED
         if (user) {
                 /*
@@ -4878,13 +5023,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
         if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
-       retval = security_task_setscheduler(p, 0, NULL);
+       retval = security_task_setscheduler(p);
         if (retval)
                 goto out_unlock;
  
         cpuset_cpus_allowed(p, cpus_allowed);
         cpumask_and(new_mask, in_mask, cpus_allowed);
- again:
+again:
         retval = set_cpus_allowed_ptr(p, new_mask);
  
         if (!retval) {
@@ -5328,7 +5473,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->se.exec_start = sched_clock();
  
         cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+       /*
+        * We're having a chicken and egg problem, even though we are
+        * holding rq->lock, the cpu isn't yet set to this cpu so the
+        * lockdep check in task_group() will fail.
+        *
+        * Similar case to sched_fork(). / Alternatively we could
+        * use task_rq_lock() here and obtain the other rq->lock.
+        *
+        * Silence PROVE_RCU
+        */
+       rcu_read_lock();
         __set_task_cpu(idle, cpu);
+       rcu_read_unlock();
  
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -8134,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(cfs_rq);
- err:
+err:
         return 0;
  }
  
@@ -8224,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  
         return 1;
  
- err_free_rq:
+err_free_rq:
         kfree(rt_rq);
- err:
+err:
         return 0;
  }
  
@@ -8584,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
         }
         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- unlock:
+unlock:
         read_unlock(&tasklist_lock);
         mutex_unlock(&rt_constraints_mutex);