Merge branch 'linux-linaro-lsk-v4.4-android' of git://git.linaro.org/kernel/linux...
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index a45a6e1a692c9a6e1f1823a7ce46c7cbb58accce..f03f54704c6d6036ec7f73b065c5675a32dbbb2b 100644
@@ -1207,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1276,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * The task_struct must be pinned here to protect the
+                * p->numa_faults access in task_weight(), since the
+                * numa_faults could already have been freed via the
+                * following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1373,6 +1381,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1398,9 +1407,16 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * If dst_rq->curr was not assigned as the best task, the reference
+        * taken on it above is no longer needed: drop it.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
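
The three hunks above implement one pattern: pin dst_rq->curr with get_task_struct() while its numa_faults are dereferenced, and drop that reference on every path where the task is not handed over to task_numa_assign() (which now keeps it as env->best_task). A minimal userspace sketch of the same acquire/hand-off/release discipline follows; struct obj, obj_get(), obj_put() and compare_and_maybe_assign() are invented for the illustration and are not kernel APIs.

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct obj {
        atomic_int refcount;            /* starts at 1, held by the creator */
        int weight;                     /* stand-in for the numa_faults data */
};

static void obj_get(struct obj *o)
{
        atomic_fetch_add(&o->refcount, 1);
}

static void obj_put(struct obj *o)
{
        if (atomic_fetch_sub(&o->refcount, 1) == 1)
                free(o);                /* last reference gone: object dies here */
}

/*
 * Pin 'cur' before dereferencing it; keep the pin only if 'cur' becomes
 * the new best (mirroring the assigned/!assigned logic in the hunks above).
 */
static void compare_and_maybe_assign(struct obj **best, struct obj *cur)
{
        int assigned = 0;

        obj_get(cur);                   /* protect cur->weight while we read it */

        if (!*best || cur->weight > (*best)->weight) {
                if (*best)
                        obj_put(*best); /* release the previous best */
                *best = cur;            /* hand the pinned reference over */
                assigned = 1;
        }

        if (!assigned)
                obj_put(cur);           /* not assigned: drop the pin */
}

int main(void)
{
        struct obj *a = calloc(1, sizeof(*a));
        struct obj *b = calloc(1, sizeof(*b));
        struct obj *best = NULL;

        if (!a || !b)
                return 1;

        atomic_init(&a->refcount, 1);
        a->weight = 3;
        atomic_init(&b->refcount, 1);
        b->weight = 7;

        compare_and_maybe_assign(&best, a);
        compare_and_maybe_assign(&best, b);
        printf("best weight: %d\n", best->weight);      /* prints 7 */

        obj_put(a);                     /* creators drop their references ... */
        obj_put(b);
        obj_put(best);                  /* ... and so does the 'best' holder */
        return 0;
}
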
@@ -2741,6 +2757,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
        return decayed || removed;
 }
 
@@ -2764,7 +2784,6 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
        if (entity_is_task(se))
                trace_sched_load_avg_task(task_of(se), &se->avg);
-       trace_sched_load_avg_cpu(cpu, cfs_rq);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4255,11 +4274,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We also do it when we enqueue a throttled task; one could argue
+        * that a throttled task should not boost a CPU, however:
+        * a) properly accounting for throttled tasks would add considerable
+        *    complexity to the solution
+        * b) the benefits of that extra complexity are hard to quantify.
+        * Thus, for the time being, we go with the simple approach and
+        * boost for throttled RQs as well.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
-                   cpu_overutilized(rq->cpu))
+                   cpu_overutilized(rq->cpu)) {
                        rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
 
                /*
                 * We want to potentially trigger a freq switch
@@ -4272,9 +4312,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                        update_capacity_of(cpu_of(rq));
        }
 
-       /* Update SchedTune accouting */
-       schedtune_enqueue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
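
Moving schedtune_enqueue_task() ahead of the capacity update matters because, as the comment above notes, the OPP request issued from this path is computed from a boosted utilization figure, and the boost contributed by the task being enqueued only exists once it has been accounted. A rough, userspace-only sketch of that dependency; struct cpu_state, boosted_util() and the numbers are made up for illustration and are not the kernel's helpers.

#include <stdio.h>

struct cpu_state {
        unsigned long util;             /* tracked CFS utilization */
        unsigned long boost_margin;     /* margin contributed by boosted tasks */
};

/* The capacity (OPP) request is derived from the boosted utilization. */
static unsigned long boosted_util(const struct cpu_state *cs)
{
        return cs->util + cs->boost_margin;
}

int main(void)
{
        struct cpu_state cpu = { .util = 200, .boost_margin = 0 };
        unsigned long task_util = 100, task_boost = 50;

        /* Capacity requested before the boost is accounted: boost is missed. */
        unsigned long early_request = boosted_util(&cpu) + task_util;

        /* Patched order: account the boost first, then request capacity. */
        cpu.util += task_util;
        cpu.boost_margin += task_boost;
        unsigned long late_request = boosted_util(&cpu);

        printf("request without boost accounting: %lu\n", early_request); /* 300 */
        printf("request with boost accounting:    %lu\n", late_request);  /* 350 */
        return 0;
}
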
@@ -4340,6 +4377,15 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
 #ifdef CONFIG_SMP
 
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
        if (!se) {
                walt_dec_cumulative_runnable_avg(rq, p);
 
@@ -4359,9 +4405,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                }
        }
 
-       /* Update SchedTune accouting */
-       schedtune_dequeue_task(p, cpu_of(rq));
-
 #endif /* CONFIG_SMP */
 
        hrtick_update(rq);
@@ -5610,11 +5653,11 @@ done:
        return target;
 }
 
-static inline int find_best_target(struct task_struct *p, bool boosted)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
 {
        int iter_cpu;
        int target_cpu = -1;
-       int target_capacity = 0;
+       int target_util = 0;
        int backup_capacity = 0;
        int best_idle_cpu = -1;
        int best_idle_cstate = INT_MAX;
@@ -5628,7 +5671,7 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                int idle_idx;
 
                /*
-                * favor higher cpus for boosted tasks
+                * Iterate from higher cpus for boosted tasks.
                 */
                int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
 
@@ -5655,10 +5698,10 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                        continue;
 #endif
                /*
-                * For boosted tasks we favor idle cpus unconditionally to
+                * Unconditionally favor idle cpus for prefer_idle tasks to
                 * improve latency.
                 */
-               if (idle_cpu(i) && boosted) {
+               if (idle_cpu(i) && prefer_idle) {
                        if (best_idle_cpu < 0)
                                best_idle_cpu = i;
                        continue;
@@ -5670,12 +5713,26 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
 
                if (new_util < cur_capacity) {
                        if (cpu_rq(i)->nr_running) {
-                               if (target_capacity == 0 ||
-                                       target_capacity > cur_capacity) {
-                                       target_cpu = i;
-                                       target_capacity = cur_capacity;
+                               if (prefer_idle) {
+                                       /* Find a target cpu with highest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util < new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
+                               } else {
+                                       /* Find a target cpu with lowest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util > new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
                                }
-                       } else if (!boosted) {
+                       } else if (!prefer_idle) {
                                if (best_idle_cpu < 0 ||
                                        (sysctl_sched_cstate_aware &&
                                                best_idle_cstate > idle_idx)) {
@@ -5685,12 +5742,13 @@ static inline int find_best_target(struct task_struct *p, bool boosted)
                        }
                } else if (backup_capacity == 0 ||
                                backup_capacity > cur_capacity) {
+                       /* Find a backup cpu with least capacity. */
                        backup_capacity = cur_capacity;
                        backup_cpu = i;
                }
        }
 
-       if (boosted && best_idle_cpu >= 0)
+       if (prefer_idle && best_idle_cpu >= 0)
                target_cpu = best_idle_cpu;
        else if (target_cpu < 0)
                target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
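
With prefer_idle split out from boosted, CPUs whose post-placement utilization still fits are ranked in two opposite ways: highest new utilization wins for prefer_idle tasks, lowest wins for everyone else (idle CPUs and the backup path are handled separately above). A condensed userspace sketch of just that ranking; pick_cpu(), the capacities and the utilization numbers are illustrative and not taken from the kernel.

#include <stdio.h>

#define NCPU 4

/*
 * Pick a CPU whose utilization plus task_util still fits its capacity.
 * prefer_idle tasks take the most utilized fitting CPU, everyone else
 * takes the least utilized one.  Purely illustrative.
 */
static int pick_cpu(const unsigned long *util, const unsigned long *cap,
                    unsigned long task_util, int prefer_idle)
{
        int target = -1;
        unsigned long target_util = 0;

        for (int i = 0; i < NCPU; i++) {
                unsigned long new_util = util[i] + task_util;

                if (new_util >= cap[i])
                        continue;       /* would not fit: skip this CPU */

                if (target < 0 ||
                    (prefer_idle && new_util > target_util) ||
                    (!prefer_idle && new_util < target_util)) {
                        target = i;
                        target_util = new_util;
                }
        }
        return target;
}

int main(void)
{
        unsigned long util[NCPU] = { 100, 300, 50, 450 };
        unsigned long cap[NCPU]  = { 512, 512, 512, 512 };

        printf("prefer_idle=1 -> CPU%d\n", pick_cpu(util, cap, 100, 1)); /* CPU1 */
        printf("prefer_idle=0 -> CPU%d\n", pick_cpu(util, cap, 100, 0)); /* CPU2 */
        return 0;
}
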
@@ -5782,14 +5840,17 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
                 */
 #ifdef CONFIG_CGROUP_SCHEDTUNE
                bool boosted = schedtune_task_boost(p) > 0;
+               bool prefer_idle = schedtune_prefer_idle(p) > 0;
 #else
                bool boosted = 0;
+               bool prefer_idle = 0;
 #endif
-               int tmp_target = find_best_target(p, boosted);
-               if (tmp_target >= 0)
+               int tmp_target = find_best_target(p, boosted, prefer_idle);
+               if (tmp_target >= 0) {
                        target_cpu = tmp_target;
-                       if (boosted && idle_cpu(target_cpu))
+                       if ((boosted || prefer_idle) && idle_cpu(target_cpu))
                                return target_cpu;
+               }
        }
 
        if (target_cpu != task_cpu(p)) {
@@ -7067,7 +7128,10 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
                raw_spin_unlock_irqrestore(&mcc->lock, flags);
-               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+/*
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
+*/
                goto skip_unlock;
 #endif
        }
@@ -7290,7 +7354,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        bool *overload, bool *overutilized)
 {
        unsigned long load;
-       int i;
+       int i, nr_running;
 
        memset(sgs, 0, sizeof(*sgs));
 
@@ -7307,7 +7371,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-               if (rq->nr_running > 1)
+               nr_running = rq->nr_running;
+               if (nr_running > 1)
                        *overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -7315,7 +7380,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
                sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
 
                if (cpu_overutilized(i)) {
@@ -7521,12 +7589,17 @@ next_group:
                        env->dst_rq->rd->overload = overload;
 
                /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized)
+               if (env->dst_rq->rd->overutilized != overutilized) {
                        env->dst_rq->rd->overutilized = overutilized;
+                       trace_sched_overutilized(overutilized);
+               }
        } else {
-               if (!env->dst_rq->rd->overutilized && overutilized)
+               if (!env->dst_rq->rd->overutilized && overutilized) {
                        env->dst_rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
        }
+
 }
 
 /**
@@ -8320,6 +8393,7 @@ static int idle_balance(struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        u64 curr_cost = 0;
+       long removed_util = 0;
 
        idle_enter_fair(this_rq);
 
@@ -8343,6 +8417,17 @@ static int idle_balance(struct rq *this_rq)
 
        raw_spin_unlock(&this_rq->lock);
 
+       /*
+        * If removed_util_avg is !0 we most probably migrated some task away
+        * from this_cpu. In this case we might want to trigger an OPP
+        * update, but only if we don't find anybody else to pull here
+        * (if we do pull a task, its enqueue will trigger an OPP update
+        * anyway).
+        *
+        * Record removed_util before calling update_blocked_averages, and use
+        * it below (before returning) to see if an OPP update is required.
+        */
+       removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
@@ -8407,6 +8492,12 @@ out:
        if (pulled_task) {
                idle_exit_fair(this_rq);
                this_rq->idle_stamp = 0;
+       } else if (removed_util) {
+               /*
+                * No task pulled and someone has been migrated away.
+                * Good case to trigger an OPP update.
+                */
+               update_capacity_of(this_cpu);
        }
 
        return pulled_task;
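
The subtle part of this hunk is the ordering: update_blocked_averages() consumes removed_util_avg, so the value has to be sampled before that call and only acted upon at the very end, once it is known that no task was pulled. A small sketch of that snapshot-then-decide shape; pending_removed, fold_blocked_averages() and request_opp_update() are invented names for the illustration.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the sketch; these are not kernel symbols. */
static atomic_long pending_removed;     /* plays the role of removed_util_avg */

static void fold_blocked_averages(void)
{
        /* Folds the removed utilization back in and clears it. */
        atomic_exchange(&pending_removed, 0);
}

static void request_opp_update(void)
{
        puts("capacity/OPP update requested");
}

static bool balance(bool pulled_task)
{
        /* Sample before fold_blocked_averages() consumes the value. */
        long removed_util = atomic_load(&pending_removed);

        fold_blocked_averages();
        /* ... pull attempt happens here; its outcome is 'pulled_task' ... */

        if (!pulled_task && removed_util)
                request_opp_update();   /* nothing pulled, but load left us */

        return pulled_task;
}

int main(void)
{
        atomic_store(&pending_removed, 256);    /* a task migrated away */
        balance(false);                         /* prints the OPP request */
        return 0;
}
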
@@ -8966,8 +9057,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                task_tick_numa(rq, curr);
 
 #ifdef CONFIG_SMP
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
                rq->rd->overutilized = true;
+               trace_sched_overutilized(true);
+       }
 
        rq->misfit_task = !task_fits_max(curr, rq->cpu);
 #endif