Merge branch 'linux-linaro-lsk-v4.4-android' of git://git.linaro.org/kernel/linux...
[firefly-linux-kernel-4.4.55.git] / kernel / sched / fair.c
index 08e2ef16d6001bf6176ff1e976fa899e2c8b7bbe..f03f54704c6d6036ec7f73b065c5675a32dbbb2b 100644 (file)
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/module.h>
 
 #include <trace/events/sched.h>
 
 #include "sched.h"
+#include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
 
+unsigned int sysctl_sched_is_big_little = 0;
+unsigned int sysctl_sched_sync_hint_enable = 1;
+unsigned int sysctl_sched_initial_task_util = 0;
+unsigned int sysctl_sched_cstate_aware = 1;
+
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
 /*
  * The initial- and re-scaling of tunables is configurable
  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
@@ -682,7 +696,9 @@ void init_entity_runnable_average(struct sched_entity *se)
        sa->period_contrib = 1023;
        sa->load_avg = scale_load_down(se->load.weight);
        sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-       sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
+       sa->util_avg =  sched_freq() ?
+               sysctl_sched_initial_task_util :
+               scale_load_down(SCHED_LOAD_SCALE);
        sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
        /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
@@ -1191,8 +1207,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
        if (env->best_task)
                put_task_struct(env->best_task);
-       if (p)
-               get_task_struct(p);
 
        env->best_task = p;
        env->best_imp = imp;
@@ -1260,20 +1274,30 @@ static void task_numa_compare(struct task_numa_env *env,
        long imp = env->p->numa_group ? groupimp : taskimp;
        long moveimp = imp;
        int dist = env->dist;
+       bool assigned = false;
 
        rcu_read_lock();
 
        raw_spin_lock_irq(&dst_rq->lock);
        cur = dst_rq->curr;
        /*
-        * No need to move the exiting task, and this ensures that ->curr
-        * wasn't reaped and thus get_task_struct() in task_numa_assign()
-        * is safe under RCU read lock.
-        * Note that rcu_read_lock() itself can't protect from the final
-        * put_task_struct() after the last schedule().
+        * No need to move the exiting task or idle task.
         */
        if ((cur->flags & PF_EXITING) || is_idle_task(cur))
                cur = NULL;
+       else {
+               /*
+                * The task_struct must be protected here to protect the
+                * p->numa_faults access in the task_weight since the
+                * numa_faults could already be freed in the following path:
+                * finish_task_switch()
+                *     --> put_task_struct()
+                *         --> __put_task_struct()
+                *             --> task_numa_free()
+                */
+               get_task_struct(cur);
+       }
+
        raw_spin_unlock_irq(&dst_rq->lock);
 
        /*
@@ -1357,6 +1381,7 @@ balance:
                 */
                if (!load_too_imbalanced(src_load, dst_load, env)) {
                        imp = moveimp - 1;
+                       put_task_struct(cur);
                        cur = NULL;
                        goto assign;
                }
@@ -1382,9 +1407,16 @@ balance:
                env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
 
 assign:
+       assigned = true;
        task_numa_assign(env, cur, imp);
 unlock:
        rcu_read_unlock();
+       /*
+        * The dst_rq->curr isn't assigned. The protection for task_struct is
+        * finished.
+        */
+       if (cur && !assigned)
+               put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2584,6 +2616,7 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
        scale_freq = arch_scale_freq_capacity(NULL, cpu);
        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+       trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
 
        /* delta_w is the amount already accumulated against our next period */
        delta_w = sa->period_contrib;
@@ -2724,6 +2757,10 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        cfs_rq->load_last_update_time_copy = sa->last_update_time;
 #endif
 
+       /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
+       if (cfs_rq == &rq_of(cfs_rq)->cfs)
+               trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
        return decayed || removed;
 }
 
@@ -2744,6 +2781,9 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
 
        if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
                update_tg_load_avg(cfs_rq, 0);
+
+       if (entity_is_task(se))
+               trace_sched_load_avg_task(task_of(se), &se->avg);
 }
 
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -4160,7 +4200,27 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
+static inline unsigned long boosted_cpu_util(int cpu);
+#else
+#define boosted_cpu_util(cpu) cpu_util(cpu)
+#endif
+
+#ifdef CONFIG_SMP
+static void update_capacity_of(int cpu)
+{
+       unsigned long req_cap;
+
+       if (!sched_freq())
+               return;
+
+       /* Convert scale-invariant capacity to cpu. */
+       req_cap = boosted_cpu_util(cpu);
+       req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu);
+       set_cfs_cpu_capacity(cpu, true, req_cap);
+}
+#endif
 
 /*
  * The enqueue_task method is called before nr_running is
@@ -4172,7 +4232,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
-       int task_new = !(flags & ENQUEUE_WAKEUP);
+#ifdef CONFIG_SMP
+       int task_new = flags & ENQUEUE_WAKEUP_NEW;
+       int task_wakeup = flags & ENQUEUE_WAKEUP;
+#endif
 
        for_each_sched_entity(se) {
                if (se->on_rq)
@@ -4189,6 +4252,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4196,6 +4260,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4204,12 +4269,50 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(cfs_rq);
        }
 
-       if (!se) {
+       if (!se)
                add_nr_running(rq, 1);
+
+#ifdef CONFIG_SMP
+
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We do it also in the case where we enqueue a throttled task;
+        * we could argue that a throttled task should not boost a CPU,
+        * however:
+        * a) properly implementing CPU boosting considering throttled
+        *    tasks will increase a lot the complexity of the solution
+        * b) it's not easy to quantify the benefits introduced by
+        *    such a more complex solution.
+        * Thus, for the time being we go for the simple solution and boost
+        * also for throttled RQs.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
+       if (!se) {
+               walt_inc_cumulative_runnable_avg(rq, p);
                if (!task_new && !rq->rd->overutilized &&
-                   cpu_overutilized(rq->cpu))
+                   cpu_overutilized(rq->cpu)) {
                        rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
+
+               /*
+                * We want to potentially trigger a freq switch
+                * request only for tasks that are waking up; this is
+                * because we get here also during load balancing, but
+                * in these cases it seems wise to trigger as single
+                * request after load balancing is done.
+                */
+               if (task_new || task_wakeup)
+                       update_capacity_of(cpu_of(rq));
        }
+
+#endif /* CONFIG_SMP */
        hrtick_update(rq);
 }
 
@@ -4239,6 +4342,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -4259,6 +4363,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4270,6 +4375,38 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        if (!se)
                sub_nr_running(rq, 1);
 
+#ifdef CONFIG_SMP
+
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
+       if (!se) {
+               walt_dec_cumulative_runnable_avg(rq, p);
+
+               /*
+                * We want to potentially trigger a freq switch
+                * request only for tasks that are going to sleep;
+                * this is because we get here also during load
+                * balancing, but in these cases it seems wise to
+                * trigger as single request after load balancing is
+                * done.
+                */
+               if (task_sleep) {
+                       if (rq->cfs.nr_running)
+                               update_capacity_of(cpu_of(rq));
+                       else if (sched_freq())
+                               set_cfs_cpu_capacity(cpu_of(rq), false, 0);
+               }
+       }
+
+#endif /* CONFIG_SMP */
+
        hrtick_update(rq);
 }
 
@@ -4496,15 +4633,6 @@ static unsigned long target_load(int cpu, int type)
        return max(rq->cpu_load[type-1], total);
 }
 
-static unsigned long capacity_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
 
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
@@ -4682,56 +4810,13 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
  * Returns the current capacity of cpu after applying both
  * cpu and freq scaling.
  */
-static unsigned long capacity_curr_of(int cpu)
+unsigned long capacity_curr_of(int cpu)
 {
        return cpu_rq(cpu)->cpu_capacity_orig *
               arch_scale_freq_capacity(NULL, cpu)
               >> SCHED_CAPACITY_SHIFT;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static unsigned long __cpu_util(int cpu, int delta)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
-
-       delta += util;
-       if (delta < 0)
-               return 0;
-
-       return (delta >= capacity) ? capacity : delta;
-}
-
-static unsigned long cpu_util(int cpu)
-{
-       return __cpu_util(cpu, 0);
-}
-
 static inline bool energy_aware(void)
 {
        return sched_feat(ENERGY_AWARE);
@@ -4745,6 +4830,19 @@ struct energy_env {
        int                     src_cpu;
        int                     dst_cpu;
        int                     energy;
+       int                     payoff;
+       struct task_struct      *task;
+       struct {
+               int before;
+               int after;
+               int delta;
+               int diff;
+       } nrg;
+       struct {
+               int before;
+               int after;
+               int delta;
+       } cap;
 };
 
 /*
@@ -4911,6 +5009,22 @@ static int sched_group_energy(struct energy_env *eenv)
                                        eenv->sg_cap = sg;
 
                                cap_idx = find_new_capacity(eenv, sg->sge);
+
+                               if (sg->group_weight == 1) {
+                                       /* Remove capacity of src CPU (before task move) */
+                                       if (eenv->util_delta == 0 &&
+                                           cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
+                                               eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
+                                               eenv->cap.delta -= eenv->cap.before;
+                                       }
+                                       /* Add capacity of dst CPU  (after task move) */
+                                       if (eenv->util_delta != 0 &&
+                                           cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
+                                               eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
+                                               eenv->cap.delta += eenv->cap.after;
+                                       }
+                               }
+
                                idle_idx = group_idle_state(sg);
                                group_util = group_norm_util(eenv, sg);
                                sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
@@ -4930,6 +5044,7 @@ static int sched_group_energy(struct energy_env *eenv)
                        } while (sg = sg->next, sg != sd->groups);
                }
 next_cpu:
+               cpumask_clear_cpu(cpu, &visit_cpus);
                continue;
        }
 
@@ -4949,7 +5064,7 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
  * utilization is removed from or added to the system (e.g. task wake-up). If
  * both are specified, the utilization is migrated.
  */
-static int energy_diff(struct energy_env *eenv)
+static inline int __energy_diff(struct energy_env *eenv)
 {
        struct sched_domain *sd;
        struct sched_group *sg;
@@ -4959,6 +5074,8 @@ static int energy_diff(struct energy_env *eenv)
                .util_delta     = 0,
                .src_cpu        = eenv->src_cpu,
                .dst_cpu        = eenv->dst_cpu,
+               .nrg            = { 0, 0, 0, 0},
+               .cap            = { 0, 0, 0 },
        };
 
        if (eenv->src_cpu == eenv->dst_cpu)
@@ -4980,15 +5097,101 @@ static int energy_diff(struct energy_env *eenv)
                                return 0; /* Invalid result abort */
                        energy_before += eenv_before.energy;
 
+                       /* Keep track of SRC cpu (before) capacity */
+                       eenv->cap.before = eenv_before.cap.before;
+                       eenv->cap.delta = eenv_before.cap.delta;
+
                        if (sched_group_energy(eenv))
                                return 0; /* Invalid result abort */
                        energy_after += eenv->energy;
                }
        } while (sg = sg->next, sg != sd->groups);
 
-       return energy_after-energy_before;
+       eenv->nrg.before = energy_before;
+       eenv->nrg.after = energy_after;
+       eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
+       eenv->payoff = 0;
+
+       trace_sched_energy_diff(eenv->task,
+                       eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
+                       eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
+                       eenv->cap.before, eenv->cap.after, eenv->cap.delta,
+                       eenv->nrg.delta, eenv->payoff);
+
+       return eenv->nrg.diff;
 }
 
+#ifdef CONFIG_SCHED_TUNE
+
+struct target_nrg schedtune_target_nrg;
+
+/*
+ * System energy normalization
+ * Returns the normalized value, in the range [0..SCHED_LOAD_SCALE],
+ * corresponding to the specified energy variation.
+ */
+static inline int
+normalize_energy(int energy_diff)
+{
+       u32 normalized_nrg;
+#ifdef CONFIG_SCHED_DEBUG
+       int max_delta;
+
+       /* Check for boundaries */
+       max_delta  = schedtune_target_nrg.max_power;
+       max_delta -= schedtune_target_nrg.min_power;
+       WARN_ON(abs(energy_diff) >= max_delta);
+#endif
+
+       /* Do scaling using positive numbers to increase the range */
+       normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
+
+       /* Scale by energy magnitude */
+       normalized_nrg <<= SCHED_LOAD_SHIFT;
+
+       /* Normalize on max energy for target platform */
+       normalized_nrg = reciprocal_divide(
+                       normalized_nrg, schedtune_target_nrg.rdiv);
+
+       return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
+}
+
+static inline int
+energy_diff(struct energy_env *eenv)
+{
+       int boost = schedtune_task_boost(eenv->task);
+       int nrg_delta;
+
+       /* Compute "absolute" energy diff */
+       __energy_diff(eenv);
+
+       /* Return energy diff when boost margin is 0 */
+       if (boost == 0)
+               return eenv->nrg.diff;
+
+       /* Compute normalized energy diff */
+       nrg_delta = normalize_energy(eenv->nrg.diff);
+       eenv->nrg.delta = nrg_delta;
+
+       eenv->payoff = schedtune_accept_deltas(
+                       eenv->nrg.delta,
+                       eenv->cap.delta,
+                       eenv->task);
+
+       /*
+        * When SchedTune is enabled, the energy_diff() function will return
+        * the computed energy payoff value. Since the energy_diff() return
+        * value is expected to be negative by its callers, this evaluation
+        * function return a negative value each time the evaluation return a
+        * positive payoff, which is the condition for the acceptance of
+        * a scheduling decision
+        */
+       return -eenv->payoff;
+}
+#else /* CONFIG_SCHED_TUNE */
+#define energy_diff(eenv) __energy_diff(eenv)
+#endif
+
 /*
  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
  * A waker of many should wake a different task than the one last awakened
@@ -5082,16 +5285,24 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 
 static inline unsigned long task_util(struct task_struct *p)
 {
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               unsigned long demand = p->ravg.demand;
+               return (demand << 10) / walt_ravg_window;
+       }
+#endif
        return p->se.avg.util_avg;
 }
 
-static unsigned int capacity_margin = 1280; /* ~20% margin */
+unsigned int capacity_margin = 1280; /* ~20% margin */
+
+static inline unsigned long boosted_task_util(struct task_struct *task);
 
 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
 {
        unsigned long capacity = capacity_of(cpu);
 
-       util += task_util(p);
+       util += boosted_task_util(p);
 
        return (capacity * 1024) > (util * capacity_margin);
 }
@@ -5120,6 +5331,112 @@ static bool cpu_overutilized(int cpu)
        return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
 }
 
+#ifdef CONFIG_SCHED_TUNE
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+       long long margin = 0;
+
+       /*
+        * Signal proportional compensation (SPC)
+        *
+        * The Boost (B) value is used to compute a Margin (M) which is
+        * proportional to the complement of the original Signal (S):
+        *   M = B * (SCHED_LOAD_SCALE - S), if B is positive
+        *   M = B * S, if B is negative
+        * The obtained M could be used by the caller to "boost" S.
+        */
+       if (boost >= 0) {
+               margin  = SCHED_LOAD_SCALE - signal;
+               margin *= boost;
+       } else
+               margin = -signal * boost;
+       /*
+        * Fast integer division by constant:
+        *  Constant   :                 (C) = 100
+        *  Precision  : 0.1%            (P) = 0.1
+        *  Reference  : C * 100 / P     (R) = 100000
+        *
+        * Thus:
+        *  Shift bits : ceil(log(R,2))  (S) = 17
+        *  Mult const : round(2^S/C)    (M) = 1311
+        *
+        *
+        */
+       margin  *= 1311;
+       margin >>= 17;
+
+       if (boost < 0)
+               margin *= -1;
+       return margin;
+}
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       int boost = schedtune_cpu_boost(cpu);
+
+       if (boost == 0)
+               return 0;
+
+       return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+       int boost = schedtune_task_boost(task);
+       unsigned long util;
+       long margin;
+
+       if (boost == 0)
+               return 0;
+
+       util = task_util(task);
+       margin = schedtune_margin(util, boost);
+
+       return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+       return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+static inline unsigned long
+boosted_cpu_util(int cpu)
+{
+       unsigned long util = cpu_util(cpu);
+       long margin = schedtune_cpu_margin(util, cpu);
+
+       trace_sched_boost_cpu(cpu, util, margin);
+
+       return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+       unsigned long util = task_util(task);
+       long margin = schedtune_task_margin(task);
+
+       trace_sched_boost_task(task, util, margin);
+
+       return util + margin;
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -5271,15 +5588,20 @@ static int select_idle_sibling(struct task_struct *p, int target)
        struct sched_domain *sd;
        struct sched_group *sg;
        int i = task_cpu(p);
+       int best_idle = -1;
+       int best_idle_cstate = -1;
+       int best_idle_capacity = INT_MAX;
 
-       if (idle_cpu(target))
-               return target;
+       if (!sysctl_sched_cstate_aware) {
+               if (idle_cpu(target))
+                       return target;
 
-       /*
-        * If the prevous cpu is cache affine and idle, don't be stupid.
-        */
-       if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
-               return i;
+               /*
+                * If the previous cpu is cache affine and idle, don't be stupid.
+                */
+               if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+                       return i;
+       }
 
        /*
         * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -5292,30 +5614,165 @@ static int select_idle_sibling(struct task_struct *p, int target)
                                                tsk_cpus_allowed(p)))
                                goto next;
 
-                       for_each_cpu(i, sched_group_cpus(sg)) {
-                               if (i == target || !idle_cpu(i))
-                                       goto next;
-                       }
+                       if (sysctl_sched_cstate_aware) {
+                               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
+                                       struct rq *rq = cpu_rq(i);
+                                       int idle_idx = idle_get_state_idx(rq);
+                                       unsigned long new_usage = boosted_task_util(p);
+                                       unsigned long capacity_orig = capacity_orig_of(i);
+                                       if (new_usage > capacity_orig || !idle_cpu(i))
+                                               goto next;
+
+                                       if (i == target && new_usage <= capacity_curr_of(target))
+                                               return target;
+
+                                       if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) {
+                                               best_idle = i;
+                                               best_idle_cstate = idle_idx;
+                                               best_idle_capacity = capacity_orig;
+                                       }
+                               }
+                       } else {
+                               for_each_cpu(i, sched_group_cpus(sg)) {
+                                       if (i == target || !idle_cpu(i))
+                                               goto next;
+                               }
 
-                       target = cpumask_first_and(sched_group_cpus(sg),
+                               target = cpumask_first_and(sched_group_cpus(sg),
                                        tsk_cpus_allowed(p));
-                       goto done;
+                               goto done;
+                       }
 next:
                        sg = sg->next;
                } while (sg != sd->groups);
        }
+       if (best_idle > 0)
+               target = best_idle;
+
 done:
        return target;
 }
 
-static int energy_aware_wake_cpu(struct task_struct *p, int target)
+static inline int find_best_target(struct task_struct *p, bool boosted, bool prefer_idle)
+{
+       int iter_cpu;
+       int target_cpu = -1;
+       int target_util = 0;
+       int backup_capacity = 0;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = INT_MAX;
+       int backup_cpu = -1;
+       unsigned long task_util_boosted, new_util;
+
+       task_util_boosted = boosted_task_util(p);
+       for (iter_cpu = 0; iter_cpu < NR_CPUS; iter_cpu++) {
+               int cur_capacity;
+               struct rq *rq;
+               int idle_idx;
+
+               /*
+                * Iterate from higher cpus for boosted tasks.
+                */
+               int i = boosted ? NR_CPUS-iter_cpu-1 : iter_cpu;
+
+               if (!cpu_online(i) || !cpumask_test_cpu(i, tsk_cpus_allowed(p)))
+                       continue;
+
+               /*
+                * p's blocked utilization is still accounted for on prev_cpu
+                * so prev_cpu will receive a negative bias due to the double
+                * accounting. However, the blocked utilization may be zero.
+                */
+               new_util = cpu_util(i) + task_util_boosted;
+
+               /*
+                * Ensure minimum capacity to grant the required boost.
+                * The target CPU can be already at a capacity level higher
+                * than the one required to boost the task.
+                */
+               if (new_util > capacity_orig_of(i))
+                       continue;
+
+#ifdef CONFIG_SCHED_WALT
+               if (walt_cpu_high_irqload(i))
+                       continue;
+#endif
+               /*
+                * Unconditionally favoring tasks that prefer idle cpus to
+                * improve latency.
+                */
+               if (idle_cpu(i) && prefer_idle) {
+                       if (best_idle_cpu < 0)
+                               best_idle_cpu = i;
+                       continue;
+               }
+
+               cur_capacity = capacity_curr_of(i);
+               rq = cpu_rq(i);
+               idle_idx = idle_get_state_idx(rq);
+
+               if (new_util < cur_capacity) {
+                       if (cpu_rq(i)->nr_running) {
+                               if (prefer_idle) {
+                                       /* Find a target cpu with highest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util < new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
+                               } else {
+                                       /* Find a target cpu with lowest
+                                        * utilization.
+                                        */
+                                       if (target_util == 0 ||
+                                               target_util > new_util) {
+                                               target_cpu = i;
+                                               target_util = new_util;
+                                       }
+                               }
+                       } else if (!prefer_idle) {
+                               if (best_idle_cpu < 0 ||
+                                       (sysctl_sched_cstate_aware &&
+                                               best_idle_cstate > idle_idx)) {
+                                       best_idle_cstate = idle_idx;
+                                       best_idle_cpu = i;
+                               }
+                       }
+               } else if (backup_capacity == 0 ||
+                               backup_capacity > cur_capacity) {
+                       // Find a backup cpu with least capacity.
+                       backup_capacity = cur_capacity;
+                       backup_cpu = i;
+               }
+       }
+
+       if (prefer_idle && best_idle_cpu >= 0)
+               target_cpu = best_idle_cpu;
+       else if (target_cpu < 0)
+               target_cpu = best_idle_cpu >= 0 ? best_idle_cpu : backup_cpu;
+
+       return target_cpu;
+}
+
+static int energy_aware_wake_cpu(struct task_struct *p, int target, int sync)
 {
        struct sched_domain *sd;
        struct sched_group *sg, *sg_target;
        int target_max_cap = INT_MAX;
        int target_cpu = task_cpu(p);
+       unsigned long task_util_boosted, new_util;
        int i;
 
+       if (sysctl_sched_sync_hint_enable && sync) {
+               int cpu = smp_processor_id();
+               cpumask_t search_cpus;
+               cpumask_and(&search_cpus, tsk_cpus_allowed(p), cpu_online_mask);
+               if (cpumask_test_cpu(cpu, &search_cpus))
+                       return cpu;
+       }
+
        sd = rcu_dereference(per_cpu(sd_ea, task_cpu(p)));
 
        if (!sd)
@@ -5324,50 +5781,76 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
        sg = sd->groups;
        sg_target = sg;
 
-       /*
-        * Find group with sufficient capacity. We only get here if no cpu is
-        * overutilized. We may end up overutilizing a cpu by adding the task,
-        * but that should not be any worse than select_idle_sibling().
-        * load_balance() should sort it out later as we get above the tipping
-        * point.
-        */
-       do {
-               /* Assuming all cpus are the same in group */
-               int max_cap_cpu = group_first_cpu(sg);
+       if (sysctl_sched_is_big_little) {
 
                /*
-                * Assume smaller max capacity means more energy-efficient.
-                * Ideally we should query the energy model for the right
-                * answer but it easily ends up in an exhaustive search.
+                * Find group with sufficient capacity. We only get here if no cpu is
+                * overutilized. We may end up overutilizing a cpu by adding the task,
+                * but that should not be any worse than select_idle_sibling().
+                * load_balance() should sort it out later as we get above the tipping
+                * point.
                 */
-               if (capacity_of(max_cap_cpu) < target_max_cap &&
-                   task_fits_max(p, max_cap_cpu)) {
-                       sg_target = sg;
-                       target_max_cap = capacity_of(max_cap_cpu);
-               }
-       } while (sg = sg->next, sg != sd->groups);
+               do {
+                       /* Assuming all cpus are the same in group */
+                       int max_cap_cpu = group_first_cpu(sg);
 
-       /* Find cpu with sufficient capacity */
-       for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
-               /*
-                * p's blocked utilization is still accounted for on prev_cpu
-                * so prev_cpu will receive a negative bias due to the double
-                * accounting. However, the blocked utilization may be zero.
-                */
-               int new_util = cpu_util(i) + task_util(p);
+                       /*
+                        * Assume smaller max capacity means more energy-efficient.
+                        * Ideally we should query the energy model for the right
+                        * answer but it easily ends up in an exhaustive search.
+                        */
+                       if (capacity_of(max_cap_cpu) < target_max_cap &&
+                           task_fits_max(p, max_cap_cpu)) {
+                               sg_target = sg;
+                               target_max_cap = capacity_of(max_cap_cpu);
+                       }
+               } while (sg = sg->next, sg != sd->groups);
 
-               if (new_util > capacity_orig_of(i))
-                       continue;
+               task_util_boosted = boosted_task_util(p);
+               /* Find cpu with sufficient capacity */
+               for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg_target)) {
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       new_util = cpu_util(i) + task_util_boosted;
 
-               if (new_util < capacity_curr_of(i)) {
-                       target_cpu = i;
-                       if (cpu_rq(i)->nr_running)
-                               break;
-               }
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       if (new_util > capacity_orig_of(i))
+                               continue;
 
-               /* cpu has capacity at higher OPP, keep it as fallback */
-               if (target_cpu == task_cpu(p))
-                       target_cpu = i;
+                       if (new_util < capacity_curr_of(i)) {
+                               target_cpu = i;
+                               if (cpu_rq(i)->nr_running)
+                                       break;
+                       }
+
+                       /* cpu has capacity at higher OPP, keep it as fallback */
+                       if (target_cpu == task_cpu(p))
+                               target_cpu = i;
+               }
+       } else {
+               /*
+                * Find a cpu with sufficient capacity
+                */
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+               bool boosted = schedtune_task_boost(p) > 0;
+               bool prefer_idle = schedtune_prefer_idle(p) > 0;
+#else
+               bool boosted = 0;
+               bool prefer_idle = 0;
+#endif
+               int tmp_target = find_best_target(p, boosted, prefer_idle);
+               if (tmp_target >= 0) {
+                       target_cpu = tmp_target;
+                       if ((boosted || prefer_idle) && idle_cpu(target_cpu))
+                               return target_cpu;
+               }
        }
 
        if (target_cpu != task_cpu(p)) {
@@ -5375,6 +5858,7 @@ static int energy_aware_wake_cpu(struct task_struct *p, int target)
                        .util_delta     = task_util(p),
                        .src_cpu        = task_cpu(p),
                        .dst_cpu        = target_cpu,
+                       .task           = p,
                };
 
                /* Not enough spare capacity on previous cpu */
@@ -5443,7 +5927,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (!sd) {
                if (energy_aware() && !cpu_rq(cpu)->rd->overutilized)
-                       new_cpu = energy_aware_wake_cpu(p, prev_cpu);
+                       new_cpu = energy_aware_wake_cpu(p, prev_cpu, sync);
                else if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, new_cpu);
 
@@ -5514,6 +5998,8 @@ static void task_dead_fair(struct task_struct *p)
 {
        remove_entity_load_avg(&p->se);
 }
+#else
+#define task_fits_max(p, cpu) true
 #endif /* CONFIG_SMP */
 
 static unsigned long
@@ -5760,6 +6246,8 @@ again:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
+       rq->misfit_task = !task_fits_max(p, rq->cpu);
+
        return p;
 simple:
        cfs_rq = &rq->cfs;
@@ -5781,9 +6269,12 @@ simple:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
+       rq->misfit_task = !task_fits_max(p, rq->cpu);
+
        return p;
 
 idle:
+       rq->misfit_task = 0;
        /*
         * This is OK, because current is on_cpu, which avoids it being picked
         * for load-balance and preemption/IRQs are still disabled avoiding
@@ -5996,6 +6487,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 enum fbq_type { regular, remote, all };
 
+/*
+ * Classification of a sched_group for load balancing.  Higher values
+ * take precedence when picking the busiest group (update_sd_pick_busiest
+ * returns early when sgs->group_type < busiest->group_type).
+ */
+enum group_type {
+       group_other = 0,        /* no particular load-balance problem */
+       group_misfit_task,      /* a cpu has a task too big for its capacity */
+       group_imbalanced,       /* flagged by sg_imbalanced() in group_classify() */
+       group_overloaded,       /* most severe: group is overloaded */
+};
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
@@ -6025,6 +6523,7 @@ struct lb_env {
        unsigned int            loop_max;
 
        enum fbq_type           fbq_type;
+       enum group_type         busiest_group_type;
        struct list_head        tasks;
 };
 
@@ -6206,7 +6705,9 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
 
        deactivate_task(env->src_rq, p, 0);
        p->on_rq = TASK_ON_RQ_MIGRATING;
+       double_lock_balance(env->src_rq, env->dst_rq);
        set_task_cpu(p, env->dst_cpu);
+       double_unlock_balance(env->src_rq, env->dst_rq);
 }
 
 /*
@@ -6351,6 +6852,10 @@ static void attach_one_task(struct rq *rq, struct task_struct *p)
 {
        raw_spin_lock(&rq->lock);
        attach_task(rq, p);
+       /*
+        * We want to potentially raise target_cpu's OPP.
+        */
+       update_capacity_of(cpu_of(rq));
        raw_spin_unlock(&rq->lock);
 }
 
@@ -6372,6 +6877,11 @@ static void attach_tasks(struct lb_env *env)
                attach_task(env->dst_rq, p);
        }
 
+       /*
+        * We want to potentially raise env.dst_cpu's OPP.
+        */
+       update_capacity_of(env->dst_cpu);
+
        raw_spin_unlock(&env->dst_rq->lock);
 }
 
@@ -6467,12 +6977,6 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 
-enum group_type {
-       group_other = 0,
-       group_imbalanced,
-       group_overloaded,
-};
-
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -6488,6 +6992,7 @@ struct sg_lb_stats {
        unsigned int group_weight;
        enum group_type group_type;
        int group_no_capacity;
+       int group_misfit_task; /* A cpu has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
        unsigned int nr_preferred_running;
@@ -6579,6 +7084,14 @@ static unsigned long scale_rt_capacity(int cpu)
 
        used = div_u64(avg, total);
 
+       /*
+        * deadline bandwidth is defined at system level so we must
+        * weight this bandwidth with the max capacity of the system.
+        * As a reminder, avg_bw is 20bits width and
+        * scale_cpu_capacity is 10 bits width
+        */
+       used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
+
        if (likely(used < SCHED_CAPACITY_SCALE))
                return SCHED_CAPACITY_SCALE - used;
 
@@ -6615,7 +7128,10 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
                mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
                raw_spin_unlock_irqrestore(&mcc->lock, flags);
-               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+/*
+               printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
+                               cpu, capacity);
+*/
                goto skip_unlock;
 #endif
        }
@@ -6630,13 +7146,14 @@ skip_unlock: __attribute__ ((unused));
 
        cpu_rq(cpu)->cpu_capacity = capacity;
        sdg->sgc->capacity = capacity;
+       sdg->sgc->max_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity;
+       unsigned long capacity, max_capacity;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -6649,6 +7166,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
        }
 
        capacity = 0;
+       max_capacity = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -6673,11 +7191,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                         */
                        if (unlikely(!rq->sd)) {
                                capacity += capacity_of(cpu);
-                               continue;
+                       } else {
+                               sgc = rq->sd->groups->sgc;
+                               capacity += sgc->capacity;
                        }
 
-                       sgc = rq->sd->groups->sgc;
-                       capacity += sgc->capacity;
+                       max_capacity = max(capacity, max_capacity);
                }
        } else  {
                /*
@@ -6687,12 +7206,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
                group = child->groups;
                do {
-                       capacity += group->sgc->capacity;
+                       struct sched_group_capacity *sgc = group->sgc;
+
+                       capacity += sgc->capacity;
+                       max_capacity = max(sgc->max_capacity, max_capacity);
                        group = group->next;
                } while (group != child->groups);
        }
 
        sdg->sgc->capacity = capacity;
+       sdg->sgc->max_capacity = max_capacity;
 }
 
 /*
@@ -6787,6 +7310,18 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
        return false;
 }
 
+
+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-cpu capacity than sched_group ref.
+ *
+ * The comparison is margin-based rather than exact: sg only counts as
+ * smaller when its max_capacity is below ref's by more than
+ * (capacity_margin - SCHED_LOAD_SCALE), so near-equal groups are not
+ * treated as smaller.
+ *
+ * NOTE(review): assumes capacity_margin >= SCHED_LOAD_SCALE so the margin
+ * term is non-negative -- confirm against capacity_margin's definition.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+       return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
+                                                       ref->sgc->max_capacity;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
                          struct sg_lb_stats *sgs)
@@ -6797,6 +7332,9 @@ group_type group_classify(struct sched_group *group,
        if (sg_imbalanced(group))
                return group_imbalanced;
 
+       if (sgs->group_misfit_task)
+               return group_misfit_task;
+
        return group_other;
 }
 
@@ -6816,7 +7354,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        bool *overload, bool *overutilized)
 {
        unsigned long load;
-       int i;
+       int i, nr_running;
 
        memset(sgs, 0, sizeof(*sgs));
 
@@ -6833,7 +7371,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->group_util += cpu_util(i);
                sgs->sum_nr_running += rq->cfs.h_nr_running;
 
-               if (rq->nr_running > 1)
+               nr_running = rq->nr_running;
+               if (nr_running > 1)
                        *overload = true;
 
 #ifdef CONFIG_NUMA_BALANCING
@@ -6841,11 +7380,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                sgs->nr_preferred_running += rq->nr_preferred_running;
 #endif
                sgs->sum_weighted_load += weighted_cpuload(i);
-               if (idle_cpu(i))
+               /*
+                * No need to call idle_cpu() if nr_running is not 0
+                */
+               if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
 
-               if (cpu_overutilized(i))
+               if (cpu_overutilized(i)) {
                        *overutilized = true;
+                       if (!sgs->group_misfit_task && rq->misfit_task)
+                               sgs->group_misfit_task = capacity_of(i);
+               }
        }
 
        /* Adjust by relative CPU capacity of the group */
@@ -6887,9 +7432,25 @@ static bool update_sd_pick_busiest(struct lb_env *env,
        if (sgs->group_type < busiest->group_type)
                return false;
 
+       /*
+        * Candidate sg doesn't face any serious load-balance problems
+        * so don't pick it if the local sg is already filled up.
+        */
+       if (sgs->group_type == group_other &&
+           !group_has_capacity(env, &sds->local_stat))
+               return false;
+
        if (sgs->avg_load <= busiest->avg_load)
                return false;
 
+       /*
+        * Candidate sg has no more than one task per cpu and has higher
+        * per-cpu capacity. No reason to pull tasks to less capable cpus.
+        */
+       if (sgs->sum_nr_running <= sgs->group_weight &&
+           group_smaller_cpu_capacity(sds->local, sg))
+               return false;
+
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
                return true;
@@ -6995,6 +7556,15 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                        sgs->group_type = group_classify(sg, sgs);
                }
 
+               /*
+                * Ignore task groups with misfit tasks if local group has no
+                * capacity or if per-cpu capacity isn't higher.
+                */
+               if (sgs->group_type == group_misfit_task &&
+                   (!group_has_capacity(env, &sds->local_stat) ||
+                    !group_smaller_cpu_capacity(sg, sds->local)))
+                       sgs->group_type = group_other;
+
                if (update_sd_pick_busiest(env, sds, sg, sgs)) {
                        sds->busiest = sg;
                        sds->busiest_stat = *sgs;
@@ -7019,12 +7589,17 @@ next_group:
                        env->dst_rq->rd->overload = overload;
 
                /* Update over-utilization (tipping point, U >= 0) indicator */
-               if (env->dst_rq->rd->overutilized != overutilized)
+               if (env->dst_rq->rd->overutilized != overutilized) {
                        env->dst_rq->rd->overutilized = overutilized;
+                       trace_sched_overutilized(overutilized);
+               }
        } else {
-               if (!env->dst_rq->rd->overutilized && overutilized)
+               if (!env->dst_rq->rd->overutilized && overutilized) {
                        env->dst_rq->rd->overutilized = true;
+                       trace_sched_overutilized(true);
+               }
        }
+
 }
 
 /**
@@ -7171,6 +7746,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         */
        if (busiest->avg_load <= sds->avg_load ||
            local->avg_load >= sds->avg_load) {
+               /* Misfitting tasks should be migrated in any case */
+               if (busiest->group_type == group_misfit_task) {
+                       env->imbalance = busiest->group_misfit_task;
+                       return;
+               }
+
+               /*
+                * Busiest group is overloaded, local is not, use the spare
+                * cycles to maximize throughput
+                */
+               if (busiest->group_type == group_overloaded &&
+                   local->group_type <= group_misfit_task) {
+                       env->imbalance = busiest->load_per_task;
+                       return;
+               }
+
                env->imbalance = 0;
                return fix_small_imbalance(env, sds);
        }
@@ -7204,6 +7795,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                (sds->avg_load - local->avg_load) * local->group_capacity
        ) / SCHED_CAPACITY_SCALE;
 
+       /* Boost imbalance to allow misfit task to be balanced. */
+       if (busiest->group_type == group_misfit_task)
+               env->imbalance = max_t(long, env->imbalance,
+                                    busiest->group_misfit_task);
+
        /*
         * if *imbalance is less than the average load per runnable task
         * there is no guarantee that any tasks will be moved so we'll have
@@ -7277,6 +7873,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
            busiest->group_no_capacity)
                goto force_balance;
 
+       /* Misfitting tasks should be dealt with regardless of the avg load */
+       if (busiest->group_type == group_misfit_task) {
+               goto force_balance;
+       }
+
        /*
         * If the local group is busier than the selected busiest group
         * don't try and pull any tasks.
@@ -7300,7 +7901,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
                 * might end up to just move the imbalance on another group
                 */
                if ((busiest->group_type != group_overloaded) &&
-                               (local->idle_cpus <= (busiest->idle_cpus + 1)))
+                   (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
+                   !group_smaller_cpu_capacity(sds.busiest, sds.local))
                        goto out_balanced;
        } else {
                /*
@@ -7313,6 +7915,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        }
 
 force_balance:
+       env->busiest_group_type = busiest->group_type;
        /* Looks like there is an imbalance. Compute it */
        calculate_imbalance(env, &sds);
        return sds.busiest;
@@ -7371,7 +7974,8 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                 */
 
                if (rq->nr_running == 1 && wl > env->imbalance &&
-                   !check_cpu_capacity(rq, env->sd))
+                   !check_cpu_capacity(rq, env->sd) &&
+                   env->busiest_group_type != group_misfit_task)
                        continue;
 
                /*
@@ -7560,6 +8164,11 @@ more_balance:
                 * ld_moved     - cumulative load moved across iterations
                 */
                cur_ld_moved = detach_tasks(&env);
+               /*
+                * We want to potentially lower env.src_cpu's OPP.
+                */
+               if (cur_ld_moved)
+                       update_capacity_of(env.src_cpu);
 
                /*
                 * We've detached some tasks from busiest_rq. Every
@@ -7784,6 +8393,7 @@ static int idle_balance(struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        u64 curr_cost = 0;
+       long removed_util=0;
 
        idle_enter_fair(this_rq);
 
@@ -7793,8 +8403,9 @@ static int idle_balance(struct rq *this_rq)
         */
        this_rq->idle_stamp = rq_clock(this_rq);
 
-       if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-           !this_rq->rd->overload) {
+       if (!energy_aware() &&
+           (this_rq->avg_idle < sysctl_sched_migration_cost ||
+            !this_rq->rd->overload)) {
                rcu_read_lock();
                sd = rcu_dereference_check_sched_domain(this_rq->sd);
                if (sd)
@@ -7806,6 +8417,17 @@ static int idle_balance(struct rq *this_rq)
 
        raw_spin_unlock(&this_rq->lock);
 
+       /*
+        * If removed_util_avg is !0 we most probably migrated some task away
+        * from this_cpu. In this case we might be willing to trigger an OPP
+        * update, but we want to do so if we don't find anybody else to pull
+        * here (we will trigger an OPP update with the pulled task's enqueue
+        * anyway).
+        *
+        * Record removed_util before calling update_blocked_averages, and use
+        * it below (before returning) to see if an OPP update is required.
+        */
+       removed_util = atomic_long_read(&(this_rq->cfs).removed_util_avg);
        update_blocked_averages(this_cpu);
        rcu_read_lock();
        for_each_domain(this_cpu, sd) {
@@ -7870,6 +8492,12 @@ out:
        if (pulled_task) {
                idle_exit_fair(this_rq);
                this_rq->idle_stamp = 0;
+       } else if (removed_util) {
+               /*
+                * No task pulled and someone has been migrated away.
+                * Good case to trigger an OPP update.
+                */
+               update_capacity_of(this_cpu);
        }
 
        return pulled_task;
@@ -7929,8 +8557,13 @@ static int active_load_balance_cpu_stop(void *data)
                schedstat_inc(sd, alb_count);
 
                p = detach_one_task(&env);
-               if (p)
+               if (p) {
                        schedstat_inc(sd, alb_pushed);
+                       /*
+                        * We want to potentially lower env.src_cpu's OPP.
+                        */
+                       update_capacity_of(env.src_cpu);
+               }
                else
                        schedstat_inc(sd, alb_failed);
        }
@@ -8423,8 +9056,15 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
 
-       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr)))
+#ifdef CONFIG_SMP
+       if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
                rq->rd->overutilized = true;
+               trace_sched_overutilized(true);
+       }
+
+       rq->misfit_task = !task_fits_max(curr, rq->cpu);
+#endif
+
 }
 
 /*