Merge branch 'linux-linaro-lsk-v4.4-android' of git://git.linaro.org/kernel/linux...
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index a45a6e1a692c9a6e1f1823a7ce46c7cbb58accce..f03f54704c6d6036ec7f73b065c5675a32dbbb2b 100644
@@ -1207,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1276,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * The task_struct must be pinned here to protect the
+                * p->numa_faults access in task_weight(), since the
+                * numa_faults could already have been freed via the
+                * following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1373,6 +1381,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1398,9 +1407,16 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * If dst_rq->curr was not assigned as the best task, the reference
+        * taken on it above is no longer needed: drop it.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
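
The three hunks above implement one pattern: pin dst_rq->curr with get_task_struct() while its numa_faults are dereferenced, and drop that reference on every path where the task is not handed over to task_numa_assign() (which now keeps it as env->best_task). A minimal userspace sketch of the same acquire/hand-off/release discipline follows; struct obj, obj_get(), obj_put() and compare_and_maybe_assign() are invented for the illustration and are not kernel APIs.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_int refcount;            /* starts at 1, held by the creator */
        int weight;                     /* stand-in for the numa_faults data */
};

static void obj_get(struct obj *o)
{
        atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct obj *o)
{
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                free(o);                /* last reference gone: object dies here */
}

/*
 * Pin 'cur' before dereferencing it; keep the pin only if 'cur' becomes
 * the new best (mirroring the assigned/!assigned logic in the hunks above).
 */
static void compare_and_maybe_assign(struct obj **best, struct obj *cur)
{
        int assigned = 0;

        obj_get(cur);                   /* protect cur->weight while we read it */

        if (!*best || cur->weight > (*best)->weight) {
                if (*best)
                        obj_put(*best); /* release the previous best */
                *best = cur;            /* hand the pinned reference over */
                assigned = 1;
        }

        if (!assigned)
                obj_put(cur);           /* not assigned: drop the pin */
}

int main(void)
{
        struct obj *a = calloc(1, sizeof(*a));
        struct obj *b = calloc(1, sizeof(*b));
        struct obj *best = NULL;

        if (!a || !b)
                return 1;

        atomic_init(&a->refcount, 1);
        a->weight = 3;
        atomic_init(&b->refcount, 1);
        b->weight = 7;

        compare_and_maybe_assign(&best, a);
        compare_and_maybe_assign(&best, b);
        printf("best weight: %d\n", best->weight);      /* prints 7 */

        obj_put(a);                     /* creators drop their references ... */
        obj_put(b);
        obj_put(best);                  /* ... and so does the 'best' holder */
        return 0;
}
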
@@ -2741,6 +2757,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
        return decayed || removed;
 }
 
@@ -2764,7 +2784,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
        if (entity_is_task(se))
                trace_sched_load_avg_task(task_of(se), &se->avg);
-       trace_sched_load_avg_cpu(cpu, cfs_rq);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4255,11 +4274,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We also do it when we enqueue a throttled task; one could argue
+        * that a throttled task should not boost a CPU, however:
+        * a) properly accounting for throttled tasks would add considerable
+        *    complexity to the solution
+        * b) the benefits of that extra complexity are hard to quantify.
+        * Thus, for the time being, we go with the simple approach and
+        * boost for throttled RQs as well.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
-                   cpu_overutilized(rq->cpu))
+                   cpu_overutilized(rq->cpu)) {
                        rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
 
                /*
                 * We want to potentially trigger a freq switch
@@ -4272,9 +4312,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        update_capacity_of(cpu_of(rq));
        }
 
-       /* Update SchedTune accouting */
-       schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
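
Moving schedtune_enqueue_task() ahead of the capacity update matters because, as the comment above notes, the OPP request issued from this path is computed from a boosted utilization figure, and the boost contributed by the task being enqueued only exists once it has been accounted. A rough, userspace-only sketch of that dependency; struct cpu_state, boosted_util() and the numbers are made up for illustration and are not the kernel's helpers.

#include <stdio.h>

struct cpu_state {
        unsigned long util;             /* tracked CFS utilization */
        unsigned long boost_margin;     /* margin contributed by boosted tasks */
};

/* The capacity (OPP) request is derived from the boosted utilization. */
static unsigned long boosted_util(const struct cpu_state *cs)
{
        return cs->util + cs->boost_margin;
}

int main(void)
{
        struct cpu_state cpu = { .util = 200, .boost_margin = 0 };
        unsigned long task_util = 100, task_boost = 50;

        /* Capacity requested before the boost is accounted: boost is missed. */
        unsigned long early_request = boosted_util(&cpu) + task_util;

        /* Patched order: account the boost first, then request capacity. */
        cpu.util += task_util;
        cpu.boost_margin += task_boost;
        unsigned long late_request = boosted_util(&cpu);

        printf("request without boost accounting: %lu\n", early_request); /* 300 */
        printf("request with boost accounting:    %lu\n", late_request);  /* 350 */
        return 0;
}
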
@@ -4340,6 +4377,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4359,9 +4405,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                }
        }
 
-       /* Update SchedTune accouting */
-       schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
        hrtick_update(rq);
@@ -5610,11 +5653,11 @@ done:
        return target;
 }
 
-static inline int find_best_target(struct task_struct *p, bool boosted)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
 {
        int iter_cpu;
        int target_cpu = -1;
-       int target_capacity = 0;
+       int target_util = 0;
        int backup_capacity = 0;
        int best_idle_cpu = -1;
        int best_idle_cstate = INT_MAX;
@@ -5628,7 +5671,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                int idle_idx;
 
                /*
-                * favor higher cpus for boosted tasks
+                * Iterate from higher cpus for boosted tasks.
                 */
                int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
 
@@ -5655,10 +5698,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                        continue;
 #endif
                /*
-                * For boosted tasks we favor idle cpus unconditionally to
+                * Unconditionally favor idle cpus for prefer_idle tasks to
                 * improve latency.
                 */
-               if (idle_cpu(i) && boosted) {
+               if (idle_cpu(i) && prefer_idle) {
                        if (best_idle_cpu < 0)
                                best_idle_cpu = i;
                        continue;
@@ -5670,12 +5713,26 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 
                if (new_util < cur_capacity) {
                        if (cpu_rq(i)->nr_running) {
-                               if (target_capacity == 0 ||
-                                       target_capacity > cur_capacity) {
-                                       target_cpu = i;
-                                       target_capacity = cur_capacity;
+                               if (prefer_idle) {
+                                       /* Find a target cpu with highest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util < new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
+                               } else {
+                                       /* Find a target cpu with lowest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util > new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
                                }
-                       } else if (!boosted) {
+                       } else if (!prefer_idle) {
                                if (best_idle_cpu < 0 ||
                                        (sysctl_sched_cstate_aware &&
                                                best_idle_cstate > idle_idx)) {
@@ -5685,12 +5742,13 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                        }
                } else if (backup_capacity == 0 ||
                                backup_capacity > cur_capacity) {
+                       /* Find a backup cpu with least capacity. */
                        backup_capacity = cur_capacity;
                        backup_cpu = i;
                }
        }
 
-       if (boosted && best_idle_cpu >= 0)
+       if (prefer_idle && best_idle_cpu >= 0)
                target_cpu = best_idle_cpu;
        else if (target_cpu < 0)
                target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
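
With prefer_idle split out from boosted, CPUs whose post-placement utilization still fits are ranked in two opposite ways: highest new utilization wins for prefer_idle tasks, lowest wins for everyone else (idle CPUs and the backup path are handled separately above). A condensed userspace sketch of just that ranking; pick_cpu(), the capacities and the utilization numbers are illustrative and not taken from the kernel.

#include <stdio.h>

#define NCPU 4

/*
 * Pick a CPU whose utilization plus task_util still fits its capacity.
 * prefer_idle tasks take the most utilized fitting CPU, everyone else
 * takes the least utilized one.  Purely illustrative.
 */
static int pick_cpu(const unsigned long *util, const unsigned long *cap,
                    unsigned long task_util, int prefer_idle)
{
        int target = -1;
        unsigned long target_util = 0;

        for (int i = 0; i < NCPU; i++) {
                unsigned long new_util = util[i] + task_util;

                if (new_util >= cap[i])
                        continue;       /* would not fit: skip this CPU */

                if (target < 0 ||
                    (prefer_idle && new_util > target_util) ||
                    (!prefer_idle && new_util < target_util)) {
                        target = i;
                        target_util = new_util;
                }
        }
        return target;
}

int main(void)
{
        unsigned long util[NCPU] = { 100, 300, 50, 450 };
        unsigned long cap[NCPU]  = { 512, 512, 512, 512 };

        printf("prefer_idle=1 -> CPU%d\n", pick_cpu(util, cap, 100, 1)); /* CPU1 */
        printf("prefer_idle=0 -> CPU%d\n", pick_cpu(util, cap, 100, 0)); /* CPU2 */
        return 0;
}
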
@@ -5782,14 +5840,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
                 */
 #ifdef CONFIG_CGROUP_SCHEDTUNE
                bool boosted = schedtune_task_boost(p) > 0;
+               bool prefer_idle = schedtune_prefer_idle(p) > 0;
 #else
                bool boosted = 0;
+               bool prefer_idle = 0;
 #endif
-               int tmp_target = find_best_target(p, boosted);
-               if (tmp_target >= 0)
+               int tmp_target = find_best_target(p, boosted, prefer_idle);
+               if (tmp_target >= 0) {
                        target_cpu = tmp_target;
-                       if (boosted && idle_cpu(target_cpu))
+                       if ((boosted || prefer_idle) && idle_cpu(target_cpu))
                                return target_cpu;
+               }
        }
 
        if (target_cpu != task_cpu(p)) {
@@ -7067,7 +7128,10 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
                raw_spin_unlock_irqrestore(&mcc->lock, flags);
-               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+/*
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
+*/
                goto skip_unlock;
 #endif
        }
@@ -7290,7 +7354,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        bool *overload, bool *overutilized)
 {
        unsigned long load;
-       int i;
+       int i, nr_running;
 
        memset(sgs, 0, sizeof(*sgs));
 
@@ -7307,7 +7371,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-               if (rq->nr_running > 1)
+               nr_running = rq->nr_running;
+               if (nr_running > 1)
                        *overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -7315,7 +7380,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
                sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
 
                if (cpu_overutilized(i)) {
@@ -7521,12 +7589,17 @@ next_group:
                        env->dst_rq->rd->overload = overload;
 
                /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized)
+               if (env->dst_rq->rd->overutilized != overutilized) {
                        env->dst_rq->rd->overutilized = overutilized;
+                       trace_sched_overutilized(overutilized);
+               }
        } else {
-               if (!env->dst_rq->rd->overutilized && overutilized)
+               if (!env->dst_rq->rd->overutilized && overutilized) {
                        env->dst_rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
        }
+
 }
 
 /**
@@ -8320,6 +8393,7 @@ static int idle_balance(struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        u64 curr_cost = 0;
+       long removed_util = 0;
 
        idle_enter_fair(this_rq);
 
@@ -8343,6 +8417,17 @@ static int idle_balance(struct rq *this_rq)
 
        raw_spin_unlock(&this_rq->lock);
 
+       /*
+        * If removed_util_avg is !0 we most probably migrated some task away
+        * from this_cpu. In this case we might want to trigger an OPP
+        * update, but only if we don't find anybody else to pull here
+        * (if we do pull a task, its enqueue will trigger an OPP update
+        * anyway).
+        *
+        * Record removed_util before calling update_blocked_averages, and use
+        * it below (before returning) to see if an OPP update is required.
+        */
+       removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
@@ -8407,6 +8492,12 @@ out:
        if (pulled_task) {
                idle_exit_fair(this_rq);
                this_rq->idle_stamp = 0;
+       } else if (removed_util) {
+               /*
+                * No task pulled and someone has been migrated away.
+                * Good case to trigger an OPP update.
+                */
+               update_capacity_of(this_cpu);
        }
 
        return pulled_task;
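
The subtle part of this hunk is the ordering: update_blocked_averages() consumes removed_util_avg, so the value has to be sampled before that call and only acted upon at the very end, once it is known that no task was pulled. A small sketch of that snapshot-then-decide shape; pending_removed, fold_blocked_averages() and request_opp_update() are invented names for the illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the sketch; these are not kernel symbols. */
static atomic_long pending_removed;     /* plays the role of removed_util_avg */

static void fold_blocked_averages(void)
{
        /* Folds the removed utilization back in and clears it. */
        atomic_exchange(&pending_removed, 0);
}

static void request_opp_update(void)
{
        puts("capacity/OPP update requested");
}

static bool balance(bool pulled_task)
{
        /* Sample before fold_blocked_averages() consumes the value. */
        long removed_util = atomic_load(&pending_removed);

        fold_blocked_averages();
        /* ... pull attempt happens here; its outcome is 'pulled_task' ... */

        if (!pulled_task && removed_util)
                request_opp_update();   /* nothing pulled, but load left us */

        return pulled_task;
}

int main(void)
{
        atomic_store(&pending_removed, 256);    /* a task migrated away */
        balance(false);                         /* prints the OPP request */
        return 0;
}
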
@@ -8966,8 +9057,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
                rq->rd->overutilized = true;
+               trace_sched_overutilized(true);
+       }
 
        rq->misfit_task = !task_fits_max(curr, rq->cpu);
 #endif