Merge remote-tracking branch 'lsk/v3.10/topic/gator' into linux-linaro-lsk
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 345865ec2857e805e3163d92f0e8b76375d08c96..97ed132c809a9c203567a7e28833757beab65aef 100644
@@ -39,6 +39,9 @@
  */
 #include <linux/cpufreq.h>
 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#ifdef CONFIG_SCHED_HMP
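+/* for cpuidle_get_cpu_driver(), used by the HMP idle-pull keepalive code below */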
+#include <linux/cpuidle.h>
+#endif
 
 #include "sched.h"
 
@@ -945,6 +948,13 @@ void task_numa_work(struct callback_head *work)
                if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
                        continue;
 
+               /*
+                * Skip inaccessible VMAs to avoid any confusion between
+                * PROT_NONE and NUMA hinting ptes
+                */
+               if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
+                       continue;
+
                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
@@ -2170,6 +2180,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
         */
        update_entity_load_avg(curr, 1);
        update_cfs_rq_blocked_load(cfs_rq, 1);
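+       /* refresh group shares too; continuously running entities never (de)queue */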
+       update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
        /*
@@ -2207,13 +2218,14 @@ static inline bool cfs_bandwidth_used(void)
        return static_key_false(&__cfs_bandwidth_used);
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled)
+void cfs_bandwidth_usage_inc(void)
 {
-       /* only need to count groups transitioning between enabled/!enabled */
-       if (enabled && !was_enabled)
-               static_key_slow_inc(&__cfs_bandwidth_used);
-       else if (!enabled && was_enabled)
-               static_key_slow_dec(&__cfs_bandwidth_used);
+       static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+       static_key_slow_dec(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
@@ -2221,7 +2233,8 @@ static bool cfs_bandwidth_used(void)
        return true;
 }
 
-void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
 #endif /* HAVE_JUMP_LABEL */
 
 /*
@@ -2473,6 +2486,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
        cfs_rq->throttled_clock = rq->clock;
        raw_spin_lock(&cfs_b->lock);
        list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
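+       /* make sure the period timer is running so this rq eventually gets unthrottled */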
+       if (!cfs_b->timer_active)
+               __start_cfs_bandwidth(cfs_b);
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -2584,6 +2599,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
        if (idle)
                goto out_unlock;
 
+       /*
+        * if we have relooped after returning idle once, we need to update our
+        * status as actually running, so that other cpus doing
+        * __start_cfs_bandwidth will stop trying to cancel us.
+        */
+       cfs_b->timer_active = 1;
+
        __refill_cfs_bandwidth_runtime(cfs_b);
 
        if (!throttled) {
@@ -2644,7 +2666,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
 /* how long we wait to gather additional slack before distributing */
 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 
-/* are we near the end of the current quota period? */
+/*
+ * Are we near the end of the current quota period?
+ *
+ * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
+ * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * migrate_hrtimers, base is never cleared, so we are fine.
+ */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 {
        struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2720,10 +2748,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        u64 expires;
 
        /* confirm we're still not at a refresh boundary */
-       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
+       raw_spin_lock(&cfs_b->lock);
+       if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
+               raw_spin_unlock(&cfs_b->lock);
                return;
+       }
 
-       raw_spin_lock(&cfs_b->lock);
        if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
                runtime = cfs_b->runtime;
                cfs_b->runtime = 0;
@@ -2848,11 +2878,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         * (timer_active==0 becomes visible before the hrtimer call-back
         * terminates).  In either case we ensure that it's re-programmed
         */
-       while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
+       while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
+              hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
+               /* bounce the lock to allow do_sched_cfs_period_timer to run */
                raw_spin_unlock(&cfs_b->lock);
-               /* ensure cfs_b->lock is available while we wait */
-               hrtimer_cancel(&cfs_b->period_timer);
-
+               cpu_relax();
                raw_spin_lock(&cfs_b->lock);
                /* if someone else restarted the timer then we're done */
                if (cfs_b->timer_active)
@@ -3514,6 +3544,111 @@ static const int hmp_max_tasks = 5;
 
 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
 
+#ifdef CONFIG_CPU_IDLE
+/*
+ * hmp_idle_pull:
+ *
+ * In this version we have stopped using forced up migrations when we
+ * detect that a task running on a little CPU should be moved to a bigger
+ * CPU. In most cases, the bigger CPU is in a deep sleep state and a forced
+ * migration means we stop the task immediately but need to wait for the
+ * target CPU to wake up before we can restart the task which is being
+ * moved. Instead, we now wake a big CPU with an IPI and ask it to pull
+ * a task when ready. This allows the task to continue executing on its
+ * current CPU, reducing the amount of time that the task is stalled for.
+ *
+ * keepalive timers:
+ *
+ * The keepalive timer is used as a way to keep a CPU engaged in an
+ * idle pull operation out of idle while waiting for the source
+ * CPU to stop and move the task. Ideally this would not be necessary
+ * and we could impose a temporary zero-latency requirement on the
+ * current CPU, but in the current QoS framework this will result in
+ * all CPUs in the system being unable to enter idle states which is
+ * not desirable. The timer does not perform any work when it expires.
+ */
+struct hmp_keepalive {
+       bool init;
+       ktime_t delay;  /* if zero, no need for timer */
+       struct hrtimer timer;
+};
+DEFINE_PER_CPU(struct hmp_keepalive, hmp_cpu_keepalive);
+
+/* keepalive timer expiry handler: performs no work, see the comment block above */
+static enum hrtimer_restart hmp_cpu_keepalive_notify(struct hrtimer *hrtimer)
+{
+       return HRTIMER_NORESTART;
+}
+
+/*
+ * Work out if any of the idle states have an exit latency too high for us.
+ * ns_delay is passed in containing the max we are willing to tolerate.
+ * If there are none, set ns_delay to zero.
+ * If there are any, set ns_delay to
+ * ('smallest target_residency among states with too-high exit latency' - 1) * 1000.
+ */
+static void hmp_keepalive_delay(int cpu, unsigned int *ns_delay)
+{
+       struct cpuidle_device *dev = per_cpu(cpuidle_devices, cpu);
+       struct cpuidle_driver *drv;
+
+       drv = cpuidle_get_cpu_driver(dev);
+       if (drv) {
+               unsigned int us_delay = UINT_MAX;
+               unsigned int us_max_delay = *ns_delay / 1000;
+               int idx;
+               /* if cpuidle states were guaranteed to be sorted we
+                * could stop at the first match.
+                */
+               for (idx = 0; idx < drv->state_count; idx++) {
+                       if (drv->states[idx].exit_latency > us_max_delay &&
+                               drv->states[idx].target_residency < us_delay) {
+                               us_delay = drv->states[idx].target_residency;
+                       }
+               }
+               if (us_delay == UINT_MAX)
+                       *ns_delay = 0; /* no timer required */
+               else
+                       *ns_delay = 1000 * (us_delay - 1);
+       }
+}
+
+static void hmp_cpu_keepalive_trigger(void)
+{
+       int cpu = smp_processor_id();
+       struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
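+       /* lazily initialise the per-cpu keepalive timer on first use */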
+       if (!keepalive->init) {
+               unsigned int ns_delay = 100000; /* tolerate 100usec delay */
+
+               hrtimer_init(&keepalive->timer,
+                               CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+               keepalive->timer.function = hmp_cpu_keepalive_notify;
+
+               hmp_keepalive_delay(cpu, &ns_delay);
+               keepalive->delay = ns_to_ktime(ns_delay);
+               keepalive->init = true;
+       }
+       if (ktime_to_ns(keepalive->delay))
+               hrtimer_start(&keepalive->timer,
+                       keepalive->delay, HRTIMER_MODE_REL_PINNED);
+}
+
+static void hmp_cpu_keepalive_cancel(int cpu)
+{
+       struct hmp_keepalive *keepalive = &per_cpu(hmp_cpu_keepalive, cpu);
+       if (keepalive->init)
+               hrtimer_cancel(&keepalive->timer);
+}
+#else /* !CONFIG_CPU_IDLE */
+static void hmp_cpu_keepalive_trigger(void)
+{
+}
+
+static void hmp_cpu_keepalive_cancel(int cpu)
+{
+}
+#endif
+
 /* Setup hmp_domains */
 static int __init hmp_cpu_mask_setup(void)
 {
@@ -3574,6 +3709,8 @@ static void hmp_offline_cpu(int cpu)
 
        if(domain)
                cpumask_clear_cpu(cpu, &domain->cpus);
+
+       hmp_cpu_keepalive_cancel(cpu);
 }
 /*
  * Needed to determine heaviest tasks etc.
@@ -3585,30 +3722,36 @@ static inline struct hmp_domain *hmp_faster_domain(int cpu);
 
 /* must hold runqueue lock for queue se is currently on */
 static struct sched_entity *hmp_get_heaviest_task(
-                               struct sched_entity *se, int migrate_up)
+                               struct sched_entity *se, int target_cpu)
 {
        int num_tasks = hmp_max_tasks;
        struct sched_entity *max_se = se;
        unsigned long int max_ratio = se->avg.load_avg_ratio;
        const struct cpumask *hmp_target_mask = NULL;
+       struct hmp_domain *hmp;
 
-       if (migrate_up) {
-               struct hmp_domain *hmp;
-               if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
-                       return max_se;
+       if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+               return max_se;
 
-               hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
-               hmp_target_mask = &hmp->cpus;
+       hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+       hmp_target_mask = &hmp->cpus;
+       if (target_cpu >= 0) {
+               /* idle_balance gets run on a CPU while
+                * it is in the middle of being hotplugged
+                * out. Bail early in that case.
+                */
+               if (!cpumask_test_cpu(target_cpu, hmp_target_mask))
+                       return NULL;
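+               /* only consider tasks able to run on the pulling CPU */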
+               hmp_target_mask = cpumask_of(target_cpu);
        }
        /* The currently running task is not on the runqueue */
        se = __pick_first_entity(cfs_rq_of(se));
 
        while (num_tasks && se) {
                if (entity_is_task(se) &&
-                       (se->avg.load_avg_ratio > max_ratio &&
-                        hmp_target_mask &&
-                        cpumask_intersects(hmp_target_mask,
-                               tsk_cpus_allowed(task_of(se))))) {
+                       se->avg.load_avg_ratio > max_ratio &&
+                       cpumask_intersects(hmp_target_mask,
+                               tsk_cpus_allowed(task_of(se)))) {
                        max_se = se;
                        max_ratio = se->avg.load_avg_ratio;
                }
@@ -4341,7 +4484,11 @@ unlock:
 #else
                new_cpu = hmp_select_slower_cpu(p, prev_cpu);
 #endif
-               if (new_cpu != prev_cpu) {
+               /*
+                * we might have no suitable CPU
+                * in which case new_cpu == NR_CPUS
+                */
+               if (new_cpu < NR_CPUS && new_cpu != prev_cpu) {
                        hmp_next_down_delay(&p->se, new_cpu);
                        trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
                        return new_cpu;
@@ -7017,7 +7164,7 @@ static void hmp_force_up_migration(int this_cpu)
                target = cpu_rq(cpu);
                raw_spin_lock_irqsave(&target->lock, flags);
                curr = target->cfs.curr;
-               if (!curr) {
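+               /* skip if nothing is running or a migration is already in flight */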
+               if (!curr || target->active_balance) {
                        raw_spin_unlock_irqrestore(&target->lock, flags);
                        continue;
                }
@@ -7031,19 +7178,20 @@ static void hmp_force_up_migration(int this_cpu)
                        }
                }
                orig = curr;
-               curr = hmp_get_heaviest_task(curr, 1);
+               curr = hmp_get_heaviest_task(curr, -1);
+               if (!curr) {
+                       raw_spin_unlock_irqrestore(&target->lock, flags);
+                       continue;
+               }
                p = task_of(curr);
                if (hmp_up_migration(cpu, &target_cpu, curr)) {
-                       if (!target->active_balance) {
-                               get_task_struct(p);
-                               target->push_cpu = target_cpu;
-                               target->migrate_task = p;
-                               got_target = 1;
-                               trace_sched_hmp_migrate(p, target->push_cpu, HMP_MIGRATE_FORCE);
-                               hmp_next_up_delay(&p->se, target->push_cpu);
-                       }
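+                       /* ask the big CPU to pull the task when ready via a resched IPI */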
+                       cpu_rq(target_cpu)->wake_for_idle_pull = 1;
+                       raw_spin_unlock_irqrestore(&target->lock, flags);
+                       spin_unlock(&hmp_force_migration);
+                       smp_send_reschedule(target_cpu);
+                       return;
                }
-               if (!got_target && !target->active_balance) {
+               if (!got_target) {
                        /*
                         * For now we just check the currently running task.
                         * Selecting the lightest task for offloading will
@@ -7065,7 +7213,7 @@ static void hmp_force_up_migration(int this_cpu)
                 * is not currently running move it, otherwise let the
                 * CPU stopper take care of it.
                 */
-               if (got_target && !target->active_balance) {
+               if (got_target) {
                        if (!task_running(target, p)) {
                                trace_sched_hmp_migrate_force_running(p, 0);
                                hmp_migrate_runnable_task(target);
@@ -7131,12 +7279,14 @@ static unsigned int hmp_idle_pull(int this_cpu)
                        }
                }
                orig = curr;
-               curr = hmp_get_heaviest_task(curr, 1);
+               curr = hmp_get_heaviest_task(curr, this_cpu);
                /* check if heaviest eligible task on this
                 * CPU is heavier than previous task
                 */
-               if (hmp_task_eligible_for_up_migration(curr) &&
-                       curr->avg.load_avg_ratio > ratio) {
+               if (curr && hmp_task_eligible_for_up_migration(curr) &&
+                       curr->avg.load_avg_ratio > ratio &&
+                       cpumask_test_cpu(this_cpu,
+                                       tsk_cpus_allowed(task_of(curr)))) {
                        p = task_of(curr);
                        target = rq;
                        ratio = curr->avg.load_avg_ratio;
@@ -7171,6 +7321,8 @@ static unsigned int hmp_idle_pull(int this_cpu)
        raw_spin_unlock_irqrestore(&target->lock, flags);
 
        if (force) {
+               /* start timer to keep us awake */
+               hmp_cpu_keepalive_trigger();
                stop_one_cpu_nowait(cpu_of(target),
                        hmp_active_task_migration_cpu_stop,
                        target, &target->active_balance_work);
@@ -7194,6 +7346,18 @@ static void run_rebalance_domains(struct softirq_action *h)
        enum cpu_idle_type idle = this_rq->idle_balance ?
                                                CPU_IDLE : CPU_NOT_IDLE;
 
+#ifdef CONFIG_SCHED_HMP
+       /* shortcut for hmp idle pull wakeups */
+       if (unlikely(this_rq->wake_for_idle_pull)) {
+               this_rq->wake_for_idle_pull = 0;
+               if (hmp_idle_pull(this_cpu)) {
+                       /* break out unless running nohz idle as well */
+                       if (idle != CPU_IDLE)
+                               return;
+               }
+       }
+#endif
+
        hmp_force_up_migration(this_cpu);
 
        rebalance_domains(this_cpu, idle);
@@ -7286,11 +7450,15 @@ static void task_fork_fair(struct task_struct *p)
        cfs_rq = task_cfs_rq(current);
        curr = cfs_rq->curr;
 
-       if (unlikely(task_cpu(p) != this_cpu)) {
-               rcu_read_lock();
-               __set_task_cpu(p, this_cpu);
-               rcu_read_unlock();
-       }
+       /*
+        * Not only the cpu but also the task_group of the parent might have
+        * changed after parent->se.{parent,cfs_rq} were copied to
+        * child->se.{parent,cfs_rq}, so call __set_task_cpu() to make the
+        * child's copies point to valid ones.
+        */
+       rcu_read_lock();
+       __set_task_cpu(p, this_cpu);
+       rcu_read_unlock();
 
        update_curr(cfs_rq);
 
@@ -7340,15 +7508,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
 
        /*
-        * Ensure the task's vruntime is normalized, so that when its
+        * Ensure the task's vruntime is normalized, so that when it's
         * switched back to the fair class the enqueue_entity(.flags=0) will
         * do the right thing.
         *
-        * If it was on_rq, then the dequeue_entity(.flags=0) will already
-        * have normalized the vruntime, if it was !on_rq, then only when
+        * If it's on_rq, then the dequeue_entity(.flags=0) will already
+        * have normalized the vruntime, if it's !on_rq, then only when
         * the task is sleeping will it still have non-normalized vruntime.
         */
-       if (!se->on_rq && p->state != TASK_RUNNING) {
+       if (!p->on_rq && p->state != TASK_RUNNING) {
                /*
                 * Fix up our vruntime so that the current sleep doesn't
                 * cause 'unlimited' sleep bonus.
@@ -7569,7 +7737,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
                se->cfs_rq = parent->my_q;
 
        se->my_q = cfs_rq;
-       update_load_set(&se->load, 0);
+       /* guarantee group entities always have weight */
+       update_load_set(&se->load, NICE_0_LOAD);
        se->parent = parent;
 }